// Copyright (C)  2000 Intel Corporation.  All rights reserved.
//
// $Header: /usr/development/orp/orp/arch/ia32/base/jit_runtime_support_ia32.cpp,v 1.39 2001/12/07 00:16:00 xli18 Exp $
//



#include "platform.h"
#include <iostream.h>
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
#include <float.h>
#include <math.h>

#include "object_layout.h"
#include "orp_types.h"
#include "Class.h"
#include "environment.h"
#include "method_lookup.h"
#include "stack_manipulation.h"
#include "exceptions.h"
#include "orp_synch.h"
#include "gc_for_orp.h"
#include "root_set_enum.h"
#include "ini.h"
#include "nogc.h"
#include "../x86/x86.h"
#include "orp_utils.h"
#include "orp_threads.h"
#include "mon_enter_exit.h"

#ifdef OBJECT_LOCK_V2
#include "mon_enter_exit_olv2.h"
#else
#include "mon_enter_exit.h"
#endif

#include "sync_bits.h"

#include "orp_stats.h"
#include "internal_jit_intf.h"
#include "jit_runtime_support_common.h"
#include "jit_runtime_support.h"

#include "jvmdi_clean.h"

#ifdef ORP_POSIX
#include "platform2.h"
#endif

#ifdef ORP_VTUNE_SUPPORT
//M:
#include "orp_vtune.h"
#endif


// Forward declarations.
// gen_setup_j2n_frame / gen_pop_j2n_frame are used below as code emitters:
// they are passed the current emit position and their result is used as the
// new emit position.  The getaddress__* functions return stub addresses.
char *gen_setup_j2n_frame(char *s);
char *gen_pop_j2n_frame(char *s);
void * getaddress__setup_java_to_native_frame();
void * getaddress__pop_java_to_native_frame();
void * getaddress__orp_instanceof_class();

void * getaddress__orp_monitor_enter_naked();
void * getaddress__orp_monitor_exit_naked();

/////////////////////////////////////////////////////////////////
// begin ORP_Runtime_Support
/////////////////////////////////////////////////////////////////
// Global critical section.  In the visible part of this file it is only used
// (through CriticalSectionHelper) by the LONG_STA logging code in the 64-bit
// shift helpers.
CriticalSection cs;


// Raises java/lang/ClassCastException on the current thread.
// Called from the checkcast stub after it has set up a J2N frame.
// GC must be disabled on entry; control is not expected to come back from
// throw_java_exception(), hence the trailing assert.
static void orp_throw_java_lang_ClassCastException()
{
    assert(!orp_is_gc_enabled(p_TLS_orpthread));

    throw_java_exception("java/lang/ClassCastException");

    // Reaching this point means the throw returned, which is a bug.
    assert(0);
} //orp_throw_java_lang_ClassCastException



// Lazily generates (on first call) and returns the address of the checkcast
// stub; later calls return the cached address.
//
// Code emitted, as can be read off the emit calls below:
//   - load the dword at [esp+4] into eax; if it is zero, return with "ret 8";
//   - otherwise push the dword at [esp+8] and eax and call orp_instanceof
//     (the callee pops its own arguments, see the comment further down);
//   - if the result is non-zero, reload [esp+4] into eax and "ret 8";
//   - if the result is zero, set up a J2N frame and call
//     orp_throw_java_lang_ClassCastException.
//
// NOTE(review): [esp+4] is presumably the object reference and [esp+8] the
// class being cast to -- confirm against the JIT calling convention.
// NOTE(review): the cached `addr` is not protected by any lock in this
// function.
static void *getaddress__orp_checkcast_naked()
{
    static void *addr = 0;
    if (addr) {
        return addr;
    }

    const int stub_size = 46;
    char *stub = (char *)gc_malloc_fixed_code_for_class_loading(stub_size);
#ifdef _DEBUG
    // Fill with breakpoint opcodes so stray execution traps in debug builds.
    memset(stub, 0xcc /*int 3*/, stub_size);
#endif
    char *ss = stub;    

    // eax = first stack argument; "or eax, eax" sets the flags for the test.
    ss = mov(ss, &eax_opnd, &M_Base_Opnd(esp_reg, +4) );
    ss = alu(ss, or_opc, &eax_opnd, &eax_opnd);

    // Short branch over the early return when eax is non-zero.  The 8-bit
    // displacement is the last byte emitted and is patched once the target
    // is known.
    ss = branch8(ss, cc_ne, &Imm_Opnd(0), 1);  // will get backpatched
    char *backpatch_address__not_null = ((char *)ss) - 1;
    ss = ret(ss, &Imm_Opnd(8));

    // Displacement is relative to the byte following the displacement byte.
    signed offset = (signed)ss - (signed)backpatch_address__not_null - 1;
    *backpatch_address__not_null = offset;

    ss = push(ss, &M_Base_Opnd(esp_reg, 8));
    ss = push(ss, &eax_opnd);
    ss = call(ss, (char *)orp_instanceof);
    // Right now it's an stdcall function, but we should fix it later.
    //ss = alu(ss, add_opc, &esp_opnd, &Imm_Opnd(8));

    // Zero result from orp_instanceof => branch to the failure path.
    ss = alu(ss, or_opc, &eax_opnd, &eax_opnd);
    ss = branch8(ss, cc_eq, &Imm_Opnd(0), 1);  // will get backpatched
    char *backpatch_address__instanceof_failed = ((char *)ss) - 1;
    // Success: return the original first argument in eax.
    ss = mov(ss, &eax_opnd, &M_Base_Opnd(esp_reg, +4) );
    ss = ret(ss, &Imm_Opnd(8));

    offset = (signed)ss - (signed)backpatch_address__instanceof_failed - 1;
    *backpatch_address__instanceof_failed = offset;

    // Failure path: build a J2N frame, then throw.  No code is emitted after
    // the call.
    ss = gen_setup_j2n_frame(ss);

    ss = call(ss, (char *)orp_throw_java_lang_ClassCastException);

    addr = stub;
    // Checked after emission, so this only detects an overflow after the fact.
    assert((ss - stub) < stub_size);
#ifdef ORP_VTUNE_SUPPORT
    //M: 
    vtune_notify_stub_load_finished("getaddress__orp_checkcast_naked",(Byte*) stub,ss - stub);
#endif
    return addr;
}  //getaddress__orp_checkcast_naked



// Lazily generates (on first call) and returns the address of a stub that
// wraps orp_instanceof_class with an inline fast path.
//
// Code emitted, as can be read off the emit calls below:
//       edx = [esp+8]; ecx = [esp+4]
//       if (ecx == edx) { eax = 1; ret }
//       eax = ecx
//   loop:
//       if (eax == 0) goto slow
//       eax = [eax+4]
//       if (eax != edx) goto loop
//       eax = 1; ret
//   slow:
//       push edx; push ecx; call orp_instanceof_class; add esp, 8; ret
//
// The stub returns with a plain "ret", i.e. it does not pop its arguments.
// NOTE(review): the load from [eax+4] presumably follows a superclass link
// stored at offset 4 of Class -- confirm against the Class layout.
void *getaddress__orp_instanceof_class()
{
    static void *addr = 0;
    if (addr) {
        return addr;
    }

    const int stub_size = 49;
    char *stub = (char *)gc_malloc_fixed_code_for_class_loading(stub_size);
#ifdef _DEBUG
    // Fill with breakpoint opcodes so stray execution traps in debug builds.
    memset(stub, 0xcc /*int 3*/, stub_size);
#endif
    char *ss = stub;

    ss = mov(ss, &edx_opnd, &M_Base_Opnd(esp_reg, 8));
    ss = mov(ss, &ecx_opnd, &M_Base_Opnd(esp_reg, 4));

    // Identical arguments: answer 1 immediately.
    ss = alu(ss, cmp_opc, &ecx_opnd, &edx_opnd);
    ss = branch8(ss, cc_ne, &Imm_Opnd(0), 0);
    char *backpatch_address__not_equal1 = ((char *)ss) - 1;
    ss = mov(ss, &eax_opnd, &Imm_Opnd(1) );
    ss = ret(ss);    

    // Patch the forward branch; displacement is relative to the byte after
    // the displacement byte.
    signed offset = (signed)ss - (signed)backpatch_address__not_equal1 - 1;
    *backpatch_address__not_equal1 = offset;    

    ss = mov(ss, &eax_opnd, &ecx_opnd );
    // Loop head: the backward branch below targets this address.
    char *address__not_equal = (char *)ss;

    ss = alu(ss, or_opc, &eax_opnd, &eax_opnd);

    // eax == 0: the inline walk found nothing, fall back to the C helper.
    ss = branch8(ss, cc_eq, &Imm_Opnd(0), 0);
    char *backpatch_address__no_compatible_super = ((char *)ss) - 1;

    ss = mov(ss, &eax_opnd, &M_Base_Opnd(eax_reg, +4) );
    ss = alu(ss, cmp_opc, &eax_opnd, &edx_opnd);

    // Backward branch: the "- 2" accounts for the 2-byte short branch that
    // is about to be emitted.
    offset = (signed)address__not_equal - (signed)ss - 2;
    ss = branch8(ss, cc_ne, &Imm_Opnd(offset), 0);

    ss = mov(ss, &eax_opnd, &Imm_Opnd(1));
    ss = ret(ss); 

    offset = (signed)ss - (signed)backpatch_address__no_compatible_super - 1;
    *backpatch_address__no_compatible_super = offset;    

    // Slow path: call orp_instanceof_class with the two saved arguments and
    // clean up the 8 bytes pushed here.
    ss = push(ss, &edx_opnd);
    ss = push(ss, &ecx_opnd);
    ss = call(ss, (char *)orp_instanceof_class);
    ss = alu(ss, add_opc, &esp_opnd, &Imm_Opnd(8));
    ss = ret(ss); 

    addr = stub;
    // Checked after emission, so this only detects an overflow after the fact.
    assert((ss - stub) < stub_size);
#ifdef ORP_VTUNE_SUPPORT
    //M: 
    vtune_notify_stub_load_finished("getaddress__orp_instanceof_class",(Byte*) stub,ss-stub);
#endif
    return addr;
} //getaddress__orp_instanceof_class



// Reports whether clss has reached the ST_Initialized state.
// Called from the class-initialization stub as its fast-path test.
// With ORP_STATS defined, the global and per-class check counters are bumped.
// GC must be disabled on entry.
static Boolean orp_is_class_initialized(Class *clss)
{
#ifdef ORP_STATS
    ++orp_stats_total.num_is_class_initialized;
    ++clss->num_class_init_checks;
#endif
    assert(!orp_is_gc_enabled(p_TLS_orpthread));

    if (clss->state == ST_Initialized) {
        return true;
    }
    return false;
} //orp_is_class_initialized



// Lazily generates (on first call) and returns the address of the class
// initialization stub.
//
// Code emitted, as can be read off the emit calls below:
//   - push [esp+4], call orp_is_class_initialized, add esp, 4;
//   - if the result is non-zero, "ret 4";
//   - otherwise set up a J2N frame, push the dword at
//     [esp + sizeof(J2N_Saved_State)], call class_initialize, add esp, 4,
//     pop the J2N frame and "ret 4".
// NOTE(review): the displacement sizeof(J2N_Saved_State) presumably addresses
// the stub's original argument above the saved state -- confirm against
// gen_setup_j2n_frame.
static void *getaddress__orp_initialize_class_naked()
{
    static void *addr = 0;
    if (addr) {
        return addr;
    }

    const int stub_size = 45;
    char *stub = (char *)gc_malloc_fixed_code_for_class_loading(stub_size);
#ifdef _DEBUG
    // Fill with breakpoint opcodes so stray execution traps in debug builds.
    memset(stub, 0xcc /*int 3*/, stub_size);
#endif
    char *ss = stub;

    // Fast check: is the class already initialized?
    ss = push(ss, &M_Base_Opnd(esp_reg, 4));
    ss = call(ss, (char *)orp_is_class_initialized);
    ss = alu(ss, add_opc, &esp_opnd, &Imm_Opnd(4));

    ss = alu(ss, or_opc, &eax_opnd, &eax_opnd);

    // Zero result => jump to the slow path; displacement patched below.
    ss = branch8(ss, cc_eq, &Imm_Opnd(0), 0);
    char *backpatch_address__class_not_initialized = ((char *)ss) - 1;
 
    ss = ret(ss, &Imm_Opnd(4));

    // Displacement is relative to the byte following the displacement byte.
    signed offset = (signed)ss - (signed)backpatch_address__class_not_initialized - 1;
    *backpatch_address__class_not_initialized = offset;

    // Slow path: run class_initialize inside a J2N frame.
    ss = gen_setup_j2n_frame(ss);

    ss = push(ss, &M_Base_Opnd(esp_reg, sizeof(J2N_Saved_State)));
    ss = call(ss, (char *)class_initialize);
    ss = alu(ss, add_opc, &esp_opnd, &Imm_Opnd(4));

    ss = gen_pop_j2n_frame(ss);
    ss = ret(ss, &Imm_Opnd(4));

    addr = stub;
    // Checked after emission, so this only detects an overflow after the fact.
    assert((ss - stub) < stub_size);
#ifdef ORP_VTUNE_SUPPORT
    //M: WangYong
    vtune_notify_stub_load_finished("getaddress__orp_initialize_class_naked",(Byte*) stub,ss-stub);
#endif
    return addr;
} //getaddress__orp_initialize_class_naked



//////////////////////////////////////////////////////////////////////
// Object allocation
//////////////////////////////////////////////////////////////////////

// Allocates an instance of clss and runs its default constructor.
// Delegates to class_alloc_new_object_and_run_constructor with a null
// constructor (which makes it look up the default one) and no arguments.
Java_java_lang_Object *class_alloc_new_object_and_run_default_constructor(Class *clss)
{
    Java_java_lang_Object *instance =
        class_alloc_new_object_and_run_constructor(clss, 0, 0);
    return instance;
} //class_alloc_new_object_and_run_default_constructor




// Allocates a new instance of clss and runs a constructor on it.
//
//   clss             - class to instantiate; must not be java/lang/Class.
//   constructor      - constructor to run, or 0 to use the default
//                      constructor looked up via
//                      env->Default_Constructor_Signature.
//   constructor_args - raw argument bytes for the constructor; walked with a
//                      pointer that is moved *down* by the size of each
//                      argument type.
//
// Returns the new object.  A pending exception after the constructor call is
// treated as fatal (assert).
//
// The new object is kept alive across the constructor call by registering
// the local `obj` in a GC frame.
//
// NOTE(review): argp is decremented by sizeof() of the Java type (e.g.
// sizeof(J_Boolean)), while the comment below says every argument occupies
// at least 4 bytes -- verify the layout of constructor_args against callers.
Java_java_lang_Object *
class_alloc_new_object_and_run_constructor(Class *clss,
                                           Method *constructor,
                                           uint8 *constructor_args)
{
    assert(strcmp(clss->name->bytes, "java/lang/Class"));
 
    volatile void *obj = gc_malloc(clss->instance_data_size, (Partial_Reveal_VTable *)clss->vtable);
 
    GC_Frame gcf;
    orp_push_gc_frame(&gcf, (void *)&obj, sizeof(volatile void *)); 

    if(!constructor) {
        // Get the default constructor
        Global_Env *env = ORP_Global_State::loader_env;
        constructor = class_lookup_method(clss, env->Default_Constructor_Signature);
        assert(constructor);
    }


    // Every argument is at least 4 bytes long, so bytes/4 is an upper bound
    // on the number of J_Value slots needed.
    int num_args_estimate = constructor->get_num_arg_bytes() / 4;
    J_Value *args = (J_Value *)malloc(num_args_estimate * sizeof(J_Value));
    // Allocation failure is treated like the other fatal conditions here.
    assert(args);
    // Slot 0 carries the receiver ("this").
    args[0].r = (J_Reference *)obj;

    int arg_num = 1;
    uint8 *argp = constructor_args;
    Arg_List_Iterator iter = constructor->get_argument_list();
    Java_Type typ;
    while((typ = curr_arg(iter)) != JAVA_TYPE_END) {
        switch(typ) {
        case JAVA_TYPE_BOOLEAN:
            args[arg_num].z = *(J_Boolean *)argp;
            argp -= sizeof(J_Boolean);
            break;
        case JAVA_TYPE_BYTE:
            args[arg_num].b = *(J_Byte *)argp;
            argp -= sizeof(J_Byte);
            break;
        case JAVA_TYPE_CHAR:
            args[arg_num].c = *(J_Char *)argp;
            argp -= sizeof(J_Char);
            break;
        case JAVA_TYPE_SHORT:
            args[arg_num].s = *(J_Short *)argp;
            argp -= sizeof(J_Short);
            break;
        case JAVA_TYPE_INT:
            args[arg_num].i = *(J_Int *)argp;
            argp -= sizeof(J_Int);
            break;
        case JAVA_TYPE_LONG:
            args[arg_num].j = *(J_Long *)argp;
            argp -= sizeof(J_Long);
            break;
        case JAVA_TYPE_DOUBLE:
            args[arg_num].d = *(J_Double *)argp;
            argp -= sizeof(J_Double);
            break;
        case JAVA_TYPE_FLOAT:
            args[arg_num].f = *(J_Float *)argp;
            argp -= sizeof(J_Float);
            break;
        case JAVA_TYPE_CLASS:
        case JAVA_TYPE_ARRAY:
            args[arg_num].r = *(J_Reference *)argp;
            argp -= sizeof(J_Reference);
            break;
        default:
            assert(0);
            break;
        }
        iter = advance_arg_iterator(iter);
        arg_num++;
        assert(arg_num <= num_args_estimate);
    }

    orp_execute_java_method_array(constructor, 0, args);

    // The argument array is only needed for the call above; release it so
    // that it is not leaked on every invocation.
    free(args);
    args = 0;

    if(get_current_thread_exception()) {
        // Panic.
        assert(0);
    }

    orp_pop_gc_frame(&gcf);

    return (Java_java_lang_Object *)obj;
} //class_alloc_new_object_and_run_constructor




// Allocates an instance of class c through gc_malloc, using the class's
// instance_data_size and vtable.  No constructor is run.
// GC must be disabled on entry, and c must not be java/lang/Class.
// With ORP_STATS defined, the allocation counter is bumped.
Java_java_lang_Object *class_alloc_new_object(Class *c)
{
#ifdef ORP_STATS
    ++orp_stats_total.num_class_alloc_new_object;
#endif
    assert(!orp_is_gc_enabled(p_TLS_orpthread));

    assert(strcmp(c->name->bytes, "java/lang/Class"));

    Partial_Reveal_VTable *vt = (Partial_Reveal_VTable *)c->vtable;
    Java_java_lang_Object *fresh =
        (Java_java_lang_Object *)gc_malloc(c->instance_data_size, vt);
    return fresh;
} //class_alloc_new_object



// Slow-path allocator for classes that have a finalizer.
// Sanity-checks (debug builds only) that c is marked as having a finalizer,
// that its class properties carry the finalizable bit, and that
// get_instance_data_size(c) differs from the raw instance_data_size field;
// then allocates through class_alloc_new_object.
static Java_java_lang_Object *
class_alloc_new_object_with_finalizer(Class *c)
{
    // Values consulted by the consistency checks below.
    unsigned int props = c->class_properties;
    unsigned int actual_size = get_instance_data_size (c);

    assert(c->has_finalizer);
    assert(get_prop_finalizable (props));
    assert(actual_size != c->instance_data_size);

    // ORP_STATS counts need to be redone inside the gc.
    // Finalization is a class property, so the ordinary allocator is used.
    Java_java_lang_Object *fresh = class_alloc_new_object(c);
    return fresh;
} //class_alloc_new_object_with_finalizer



// Lazily generates (on first call) and returns the address of the object
// allocation stub.
//
// Code emitted, as can be read off the emit calls below:
//   - push [esp+4], call class_alloc_new_object_or_null, add esp, 4;
//   - if eax is non-zero, "ret 4" with the result in eax;
//   - otherwise set up a J2N frame, push the dword at
//     [esp + sizeof(J2N_Saved_State)], call class_alloc_new_object,
//     add esp, 4, pop the J2N frame and "ret 4".
// NOTE(review): the displacement sizeof(J2N_Saved_State) presumably addresses
// the stub's original argument -- confirm against gen_setup_j2n_frame.
static void *getaddress__orp_alloc_java_object_resolved_naked()
{
    static void *addr = 0;
    if (addr) {
        return addr;
    }

    const int stub_size = 45;
    char *stub = (char *)gc_malloc_fixed_code_for_class_loading(stub_size);
#ifdef _DEBUG
    // Fill with breakpoint opcodes so stray execution traps in debug builds.
    memset(stub, 0xcc /*int 3*/, stub_size);
#endif
    char *ss = stub;

    // Fast path: try the allocator variant that may return null.
    ss = push(ss, &M_Base_Opnd(esp_reg, 4));
    ss = call(ss, (char *)class_alloc_new_object_or_null);
    ss = alu(ss, add_opc, &esp_opnd, &Imm_Opnd(4));

    ss = alu(ss, or_opc, &eax_opnd, &eax_opnd);

    // Null result => jump to the slow path; displacement patched below.
    ss = branch8(ss, cc_eq, &Imm_Opnd(0), 0);
    char *backpatch_address__fast_alloc_failed = ((char *)ss) - 1;
    ss = ret(ss, &Imm_Opnd(4));

    // Displacement is relative to the byte following the displacement byte.
    signed offset = (signed)ss - (signed)backpatch_address__fast_alloc_failed - 1;
    *backpatch_address__fast_alloc_failed = offset;
    
    // Slow path: allocate inside a J2N frame.
    ss = gen_setup_j2n_frame(ss);
    ss = push(ss, &M_Base_Opnd(esp_reg, sizeof(J2N_Saved_State) ) );
    ss = call(ss, (char *)class_alloc_new_object);
    ss = alu(ss, add_opc, &esp_opnd, &Imm_Opnd(4));

    ss = gen_pop_j2n_frame(ss);
    ss = ret(ss, &Imm_Opnd(4));

    addr = stub;
    // Checked after emission, so this only detects an overflow after the fact.
    assert((ss - stub) < stub_size);
#ifdef ORP_VTUNE_SUPPORT
    //M: 
    vtune_notify_stub_load_finished("getaddress__orp_alloc_java_object_resolved_naked",(Byte*) stub,ss-stub);
#endif
    return addr;
} //getaddress__orp_alloc_java_object_resolved_naked


// Lazily generates (on first call) and returns the address of the allocation
// stub for classes with finalizers.
//
// Code emitted, as can be read off the emit calls below:
//   - push [esp+4], call class_alloc_new_object_with_finalizer_or_null,
//     add esp, 4;
//   - if eax is non-zero, "ret 4" with the result in eax;
//   - otherwise set up a J2N frame, push the dword at
//     [esp + sizeof(J2N_Saved_State)], call
//     class_alloc_new_object_with_finalizer, add esp, 4, pop the J2N frame
//     and "ret 4".
static void *getaddress__orp_alloc_java_object_with_finalizer_resolved_naked()
{
    static void *addr = 0;
    if (addr) {
        return addr;
    }

    const int stub_size = 45;
    char *stub = (char *)gc_malloc_fixed_code_for_class_loading(stub_size);
#ifdef _DEBUG
    // Fill with breakpoint opcodes so stray execution traps in debug builds.
    memset(stub, 0xcc /*int 3*/, stub_size);
#endif
    char *ss = stub;

    // Fast path: try the allocator variant that may return null.
    ss = push(ss, &M_Base_Opnd(esp_reg, 4));
    ss = call(ss, (char *)class_alloc_new_object_with_finalizer_or_null);
    ss = alu(ss, add_opc, &esp_opnd, &Imm_Opnd(4));

    ss = alu(ss, or_opc, &eax_opnd, &eax_opnd);
    // Null result => jump to the slow path; displacement patched below.
    ss = branch8(ss, cc_eq, &Imm_Opnd(0), 0);
    char *backpatch_address__fast_alloc_failed = ((char *)ss) - 1;

    ss = ret(ss, &Imm_Opnd(4));

    // Displacement is relative to the byte following the displacement byte.
    signed offset = (signed)ss - (signed)backpatch_address__fast_alloc_failed - 1;
    *backpatch_address__fast_alloc_failed = offset;

    // Slow path: allocate inside a J2N frame.
    ss = gen_setup_j2n_frame(ss);
    ss = push(ss, &M_Base_Opnd(esp_reg, sizeof(J2N_Saved_State) ) );
    ss = call(ss, (char *)class_alloc_new_object_with_finalizer);
    ss = alu(ss, add_opc, &esp_opnd, &Imm_Opnd(4));
    ss = gen_pop_j2n_frame(ss);
    ss = ret(ss, &Imm_Opnd(4));
    
    addr = stub;
    // Checked after emission, so this only detects an overflow after the fact.
    assert((ss - stub) < stub_size);
#ifdef ORP_VTUNE_SUPPORT
    //M: 
    vtune_notify_stub_load_finished("getaddress__orp_alloc_java_object_with_finalizer_resolved_naked",(Byte*) stub,ss-stub);
#endif
    return addr;
} //getaddress__orp_alloc_java_object_with_finalizer_resolved_naked



// Exception selectors for a failed aastore.  The per-exception stubs below
// load one of these into ecx, the trampoline passes it on as exc_code, and
// orp_aastore_exception() maps it to the exception class to throw.
#define AASTORE_EXC_CODE_NULLPOINTER            1
#define AASTORE_EXC_CODE_ARRAYINDEXOUTOFBOUNDS  2
#define AASTORE_EXC_CODE_ARRAYSTORE             3


// Final step of a failed aastore: throws the exception selected by exc_code.
//
//   exc_code - one of the AASTORE_EXC_CODE_* selectors.
//   eip_arg  - code address associated with the failing store; it is moved
//              back by 5 bytes before being handed to orp_athrow.
//   edi_arg .. esp_arg - register values forwarded unchanged to orp_athrow.
//
// The arguments are laid out on the stack by the aastore exception
// trampoline, which jumps here.
// NOTE(review): the 5-byte adjustment presumably steps back over a call
// instruction so that eip points into the faulting call site -- confirm.
static void __stdcall
orp_aastore_exception(volatile uint32 exc_code,
                      volatile uint32 eip_arg,
                      volatile uint32 edi_arg,
                      volatile uint32 esi_arg,
                      volatile uint32 ebx_arg,
                      volatile uint32 ebp_arg,
                      volatile uint32 esp_arg)
{
    Class *exc_class = 0;
    Global_Env *genv = ORP_Global_State::loader_env;

    switch(exc_code) {
    case AASTORE_EXC_CODE_ARRAYSTORE:
        exc_class = genv->java_lang_ArrayStoreException_Class;
        break;
    case AASTORE_EXC_CODE_ARRAYINDEXOUTOFBOUNDS:
        exc_class = genv->java_lang_ArrayIndexOutOfBoundsException_Class;
        break;
    case AASTORE_EXC_CODE_NULLPOINTER:
        exc_class = genv->java_lang_NullPointerException_Class;
        break;
    default:
        // Unknown selector: programming error.
        assert(0);
        break;
    }
    assert(exc_class);

    eip_arg = eip_arg - 5;

    orp_athrow(TRUE,
               edi_arg, esi_arg, ebx_arg, ebp_arg, esp_arg,
               eip_arg,
               (Java_java_lang_Object *)exc_class);
} //orp_aastore_exception


// Returns the address of the shared aastore exception trampoline, generating
// it on first use.
//
// The emitted code lowers esp by 16, pushes esp, ebp, ebx, esi, edi, eax and
// ecx (in that order), reserves 4 more bytes, and jumps to
// orp_aastore_exception.  The pushes therefore appear to the callee as its
// stdcall arguments: ecx = exc_code, eax = eip_arg, then edi, esi, ebx, ebp,
// esp.
static void *getaddress__orp_aastore_exception_trampoline()
{
    static void *addr = 0;
    if (addr != 0) {
        return addr;
    }

    const int stub_size = 19;
    char *code = (char *)gc_malloc_fixed_code_for_class_loading(stub_size);
#ifdef _DEBUG
    memset(code, 0xcc /*int 3*/, stub_size);
#endif

    char *emit = alu(code, sub_opc, &esp_opnd, &Imm_Opnd(16));

    // Argument block for orp_aastore_exception, last parameter first.
    emit = push(emit, &esp_opnd);
    emit = push(emit, &ebp_opnd);
    emit = push(emit, &ebx_opnd);
    emit = push(emit, &esi_opnd);
    emit = push(emit, &edi_opnd);
    emit = push(emit, &eax_opnd);
    emit = push(emit, &ecx_opnd);

    // Slot where a return address would sit, since we jump instead of call.
    emit = alu(emit, sub_opc, &esp_opnd, &Imm_Opnd(4));
    emit = jump(emit, (char *)orp_aastore_exception);

    addr = code;
    assert((emit - code) < stub_size);
#ifdef ORP_VTUNE_SUPPORT
    vtune_notify_stub_load_finished("getaddress__orp_aastore_exception_trampoline",(Byte*) code, emit - code);
#endif
    return addr;
} //getaddress__orp_aastore_exception_trampoline


// Returns the address of the stub used when aastore hits a null array,
// generating it on first use.  The stub loads
// AASTORE_EXC_CODE_NULLPOINTER into ecx and jumps to the shared trampoline.
static void *getaddress__orp_aastore_exception_nullpointer()
{
    static void *addr = 0;
    if (addr != 0) {
        return addr;
    }

    const int stub_size = 11;
    char *code = (char *)gc_malloc_fixed_code_for_class_loading(stub_size);
#ifdef _DEBUG
    memset(code, 0xcc /*int 3*/, stub_size);
#endif

    char *emit = mov(code, &ecx_opnd, &Imm_Opnd(AASTORE_EXC_CODE_NULLPOINTER) );
    char *trampoline = (char *)getaddress__orp_aastore_exception_trampoline();
    emit = jump(emit, trampoline);

    addr = code;
    assert((emit - code) < stub_size);
#ifdef ORP_VTUNE_SUPPORT
    vtune_notify_stub_load_finished("getaddress__orp_aastore_exception_nullpointer",(Byte*) code, emit - code);
#endif
    return addr;
} //getaddress__orp_aastore_exception_nullpointer


// Returns the address of the stub used when aastore gets an out-of-range
// index, generating it on first use.  The stub loads
// AASTORE_EXC_CODE_ARRAYINDEXOUTOFBOUNDS into ecx and jumps to the shared
// trampoline.
static void *getaddress__orp_aastore_exception_arrayindexoutofbounds()
{
    static void *addr = 0;
    if (addr != 0) {
        return addr;
    }

    const int stub_size = 11;
    char *code = (char *)gc_malloc_fixed_code_for_class_loading(stub_size);
#ifdef _DEBUG
    memset(code, 0xcc /*int 3*/, stub_size);
#endif

    char *emit = mov(code, &ecx_opnd, &Imm_Opnd(AASTORE_EXC_CODE_ARRAYINDEXOUTOFBOUNDS) );
    char *trampoline = (char *)getaddress__orp_aastore_exception_trampoline();
    emit = jump(emit, trampoline);

    addr = code;
    assert((emit - code) < stub_size);
#ifdef ORP_VTUNE_SUPPORT
    vtune_notify_stub_load_finished("getaddress__orp_aastore_exception_arrayindexoutofbounds",(Byte*) code, emit - code);
#endif
    return addr;
} //getaddress__orp_aastore_exception_arrayindexoutofbounds


// Returns the address of the stub used when aastore fails its element type
// check, generating it on first use.  The stub loads
// AASTORE_EXC_CODE_ARRAYSTORE into ecx and jumps to the shared trampoline.
static void *getaddress__orp_aastore_exception_arraystore()
{
    static void *addr = 0;
    if (addr != 0) {
        return addr;
    }

    const int stub_size = 11;
    char *code = (char *)gc_malloc_fixed_code_for_class_loading(stub_size);
#ifdef _DEBUG
    memset(code, 0xcc /*int 3*/, stub_size);
#endif

    char *emit = mov(code, &ecx_opnd, &Imm_Opnd(AASTORE_EXC_CODE_ARRAYSTORE) );
    char *trampoline = (char *)getaddress__orp_aastore_exception_trampoline();
    emit = jump(emit, trampoline);

    addr = code;
    assert((emit - code) < stub_size);
#ifdef ORP_VTUNE_SUPPORT
    vtune_notify_stub_load_finished("getaddress__orp_aastore_exception_arraystore",(Byte*) code, emit - code);
#endif
    return addr;
} //getaddress__orp_aastore_exception_arraystore


// Runtime helper for storing a reference into an object array.
//
//   elem  - reference to store (may be NULL).
//   idx   - element index.
//   array - destination array (may be NULL, which is reported as an error).
//
// On success the element is written with gc_heap_slot_write_ref() and 0 is
// returned.
//
// On failure (null array, index out of range, or incompatible element type)
// the function overwrites its OWN return address with the address of the
// matching exception stub and returns the original return address.  The
// stubs leave eax untouched and the trampoline pushes eax in the position of
// orp_aastore_exception's eip_arg parameter.
static void *__stdcall
orp_aastore(volatile Java_java_lang_Object *elem,
            int idx,
            JavaArrayOfObject *array) stdcall__;

static void *__stdcall
orp_aastore(volatile Java_java_lang_Object *elem,
            int idx,
            JavaArrayOfObject *array)
{
    assert ((elem == NULL) || (elem->vt != NULL));
#ifdef ORP_STATS
    orp_stats_total.num_aastores++;
#endif
    void *new_eip = 0;
    if(!array) {
        new_eip = getaddress__orp_aastore_exception_nullpointer();
    } else if((unsigned)array->length <= (unsigned)idx) {
        // The unsigned comparison also rejects negative indices.
        new_eip = getaddress__orp_aastore_exception_arrayindexoutofbounds();
    } else {
        assert(idx >= 0);
        if(elem) {
#ifdef ORP_STATS
            orp_stats_total.num_instanceof_in_aastore++;
#endif
            Class *array_class = array->vt->clss;
            assert(array_class);
            assert(array_class->is_array);

#if 1
            // Use fastinstof to eliminate unnecessary calls to
            // orp_instanceof().
            // Only applies when the array's element class is neither an
            // array nor an interface and its depth is below
            // MAX_FAST_INSTOF_DEPTH; the element then matches if the element
            // class has depth 0 or appears at index depth-1 of the stored
            // object's superclasses table.
            Class *elem_class = array_class->array_element_class;
            int depth = elem_class->depth;
            Class **superclasses = elem->vt->superclasses;
            if(!elem_class->is_array &&
               !(elem_class->access_flags & ACC_INTERFACE) &&
               depth < MAX_FAST_INSTOF_DEPTH) {
                if(!depth || superclasses[depth - 1] == elem_class) {
                    gc_heap_slot_write_ref ((Java_java_lang_Object *)array,
                        (Java_java_lang_Object **)&(array->body[idx]),
                        (Java_java_lang_Object *) elem);

                    return 0;
                }
            }
#endif
            // General (slow) type check.
            if (orp_instanceof((Java_java_lang_Object *) elem, array_class->array_element_class)) {
                
                gc_heap_slot_write_ref ((Java_java_lang_Object *)array,
                    (Java_java_lang_Object **)&(array->body[idx]),
                    (Java_java_lang_Object *) elem);

                return 0;
            }
            new_eip = (void *) getaddress__orp_aastore_exception_arraystore();
        } else {
            assert(elem == NULL);
            // Don't have to check types for a null reference
//            array->body[idx] = (Java_java_lang_Object *)elem;
            gc_heap_slot_write_ref ((Java_java_lang_Object *)array,
                (Java_java_lang_Object **)&(array->body[idx]),
                (Java_java_lang_Object *) elem);
            return 0;
        }
    }

#ifdef ORP_STATS
    orp_stats_total.num_aastore_exceptions++;
#endif

    //
    // This may possibly break if the C compiler applies very aggressive
    // optimizations.
    //
    // The slot immediately below the first argument on the stack is taken to
    // be this function's return address; it is redirected to the exception
    // stub and the original value is handed back to the caller of that stub.
    void **saved_eip = ((void **)&elem) - 1;
    void *old_eip = *saved_eip;
    *saved_eip = new_eip;
    return old_eip;
} //orp_aastore



// Returns the address of the anewarray stub, generating it on first use.
//
// The emitted code sets up a J2N frame, pushes two dwords (each addressed as
// [esp + sizeof(J2N_Saved_State) + 4] at the time of the push), calls
// orp_anewarray_resolved_array_type, discards the 8 pushed bytes, pops the
// J2N frame and returns with "ret 8".
static void *getaddress__orp_anewarray_resolved_naked()
{
    static void *addr = 0;
    if (addr != 0) {
        return addr;
    }

    const int stub_size = 30;
    char *code = (char *)gc_malloc_fixed_code_for_class_loading(stub_size);
#ifdef _DEBUG
    memset(code, 0xcc /*int 3*/, stub_size);
#endif

    char *emit = gen_setup_j2n_frame(code);

    // Same displacement twice: the first push moves esp by 4, so the second
    // push reads the neighbouring stack slot.
    emit = push(emit, &M_Base_Opnd(esp_reg, (sizeof(J2N_Saved_State) + 4)));
    emit = push(emit, &M_Base_Opnd(esp_reg, (sizeof(J2N_Saved_State) + 4)));
    emit = call(emit, (char *)orp_anewarray_resolved_array_type);
    emit = alu(emit, add_opc, &esp_opnd, &Imm_Opnd(8));

    emit = gen_pop_j2n_frame(emit);
    emit = ret(emit, &Imm_Opnd(8));

    assert((emit - code) < stub_size);
    addr = code;
#ifdef ORP_VTUNE_SUPPORT
    vtune_notify_stub_load_finished("getaddress__orp_anewarray_resolved_naked",(Byte*) code, emit - code);
#endif
    return addr;
} //getaddress__orp_anewarray_resolved_naked


// Returns the address of the newarray stub, generating it on first use.
//
// The emitted code sets up a J2N frame, pushes two dwords (each addressed as
// [esp + sizeof(J2N_Saved_State) + 4] at the time of the push), calls
// orp_newarray, discards the 8 pushed bytes, pops the J2N frame and returns
// with "ret 8".
static void * getaddress__orp_newarray_naked()
{
    static void *addr = 0;
    if (addr != 0) {
        return addr;
    }

    const int stub_size = 30;
    char *code = (char *)gc_malloc_fixed_code_for_class_loading(stub_size);
#ifdef _DEBUG
    memset(code, 0xcc /*int 3*/, stub_size);
#endif

    char *emit = gen_setup_j2n_frame(code);

    // Same displacement twice: the first push moves esp by 4, so the second
    // push reads the neighbouring stack slot.
    emit = push(emit, &M_Base_Opnd(esp_reg, (sizeof(J2N_Saved_State) + 4)));
    emit = push(emit, &M_Base_Opnd(esp_reg, (sizeof(J2N_Saved_State) + 4)));
    emit = call(emit, (char *)orp_newarray);
    emit = alu(emit, add_opc, &esp_opnd, &Imm_Opnd(8));

    emit = gen_pop_j2n_frame(emit);
    emit = ret(emit, &Imm_Opnd(8));

    addr = code;
    assert((emit - code) < stub_size);
#ifdef ORP_VTUNE_SUPPORT
    vtune_notify_stub_load_finished("getaddress__orp_newarray_naked",(Byte*) code, emit - code);
#endif
    return addr;
} //getaddress__orp_newarray_naked



// Returns the address of the new-vector stub, generating it on first use.
//
// The emitted code sets up a J2N frame, pushes two dwords (each addressed as
// [esp + sizeof(J2N_Saved_State) + 4] at the time of the push), calls
// orp_new_vector, discards the 8 pushed bytes, pops the J2N frame and
// returns with "ret 8".
static void * getaddress__orp_new_vector_naked()
{
    static void *addr = 0;
    if (addr != 0) {
        return addr;
    }

    const int stub_size = 30;
    char *code = (char *)gc_malloc_fixed_code_for_class_loading(stub_size);
#ifdef _DEBUG
    memset(code, 0xcc /*int 3*/, stub_size);
#endif

    char *emit = gen_setup_j2n_frame(code);

    // Same displacement twice: the first push moves esp by 4, so the second
    // push reads the neighbouring stack slot.
    emit = push(emit, &M_Base_Opnd(esp_reg, (sizeof(J2N_Saved_State) + 4)));
    emit = push(emit, &M_Base_Opnd(esp_reg, (sizeof(J2N_Saved_State) + 4)));
    emit = call(emit, (char *)orp_new_vector);
    emit = alu(emit, add_opc, &esp_opnd, &Imm_Opnd(8));

    emit = gen_pop_j2n_frame(emit);
    emit = ret(emit, &Imm_Opnd(8));

    addr = code;
    assert((emit - code) < stub_size);
#ifdef ORP_VTUNE_SUPPORT
    vtune_notify_stub_load_finished("getaddress__orp_new_vector_naked",(Byte*) code, emit - code);
#endif
    return addr;
} //getaddress__orp_new_vector_naked





//
// The generated stub behaves like a __cdecl function: it returns with a
// plain "ret", so its caller must pop the arguments.
//
// Lazily generates (on first call) and returns the address of the
// multianewarray stub.
//
// Code emitted, as can be read off the emit calls below
// (S = sizeof(J2N_Saved_State)):
//       set up a J2N frame
//       ecx = [esp + S + 4]
//       eax = address formed from esp, ecx as index, displacement S + 4
//   loop:
//       push [eax]; eax -= 4; dec ecx; jne loop
//       push [eax]; push [eax - 4]
//       call orp_multianewarray_resolved
//       ecx = [esp + 4]
//       esp = address formed from esp, ecx as index, displacement 8
//       pop the J2N frame; ret
// NOTE(review): the last argument (2) of M_Index_Opnd is presumably a shift
// count, i.e. the index is scaled by 4 -- confirm against x86.h.
// NOTE(review): ecx presumably holds the number of dimensions, so the loop
// copies that many count arguments -- confirm against the JIT's call site.
//


static void *getaddress__orp_multianewarray_resolved_naked()
{
    static void *addr = 0;
    if (addr) {
        return addr;
    }

    const int stub_size = 48;
    char *stub = (char *)gc_malloc_fixed_code_for_class_loading(stub_size);
#ifdef _DEBUG
    // Fill with breakpoint opcodes so stray execution traps in debug builds.
    memset(stub, 0xcc /*int 3*/, stub_size);
#endif
    char *ss = stub;

    ss = gen_setup_j2n_frame(ss);
    ss = mov(ss, &ecx_opnd, &M_Base_Opnd(esp_reg, (sizeof(J2N_Saved_State)+4)) );

    ss = lea(ss, &eax_opnd, &M_Index_Opnd(esp_reg, ecx_reg, (sizeof(J2N_Saved_State)+4), 2) );
    
    // Loop head for the argument-copy loop.
    char *address_push_count_arg = (char *)ss;

    ss = push(ss, &M_Base_Opnd(eax_reg, 0) );
    ss = alu(ss, sub_opc, &eax_opnd, &Imm_Opnd(4));
    // dec is emitted last so that its flags drive the branch below.
    ss = dec(ss, &ecx_opnd);

    // Backward branch: the "- 2" accounts for the 2-byte short branch that
    // is about to be emitted.
    signed offset = (signed)address_push_count_arg - (signed)ss - 2;
    ss = branch8(ss, cc_ne, &Imm_Opnd(offset), 0);

    // Two more dwords below the copied block are pushed before the call.
    ss = push(ss, &M_Base_Opnd(eax_reg, 0) );
    ss = push(ss, &M_Base_Opnd(eax_reg, -4) );

    ss = call(ss, (char *)orp_multianewarray_resolved);

    // Discard everything pushed above: reload the count from [esp+4] and
    // move esp past the pushed words.
    ss = mov(ss, &ecx_opnd, &M_Base_Opnd(esp_reg, +4) );

    ss = lea(ss, &esp_opnd, &M_Index_Opnd(esp_reg, ecx_reg, +8, 2) );

    ss = gen_pop_j2n_frame(ss);
    ss = ret(ss);

    addr = stub;
    // Checked after emission, so this only detects an overflow after the fact.
    assert((ss - stub) < stub_size);
#ifdef ORP_VTUNE_SUPPORT
    //M: 
    vtune_notify_stub_load_finished("getaddress__orp_multianewarray_resolved_naked",(Byte*) stub,ss-stub);
#endif
    return addr;
} //getaddress__orp_multianewarray_resolved_naked


// Lazily generates (on first call) and returns the address of the stub that
// instantiates a constant-pool string.
//
// Code emitted, as can be read off the emit calls below:
//   - push the stub's two stack arguments, call
//     orp_instantiate_cp_string_fast, add esp, 8;
//   - if eax is non-zero, "ret 8" with the result in eax;
//   - otherwise set up a J2N frame, push the same two arguments (now at
//     displacement sizeof(J2N_Saved_State)+4), call
//     orp_instantiate_cp_string_slow, add esp, 8, pop the J2N frame and
//     "ret 8".
static void *getaddress__orp_instantiate_cp_string_naked()
{
    static void *addr = 0;
    if (addr) {
        return addr;
    }

    const int stub_size = 54;
    char *stub = (char *)gc_malloc_fixed_code_for_class_loading(stub_size);
#ifdef _DEBUG
    // Fill with breakpoint opcodes so stray execution traps in debug builds.
    memset(stub, 0xcc /*int 3*/, stub_size);
#endif
    char *ss = stub;

    // Same displacement twice: the first push moves esp by 4, so the second
    // push reads the neighbouring stack slot.
    ss = push(ss, &M_Base_Opnd(esp_reg, 8));
    ss = push(ss, &M_Base_Opnd(esp_reg, 8));
    ss = call(ss, (char *)orp_instantiate_cp_string_fast);
    ss = alu(ss, add_opc, &esp_opnd, &Imm_Opnd(8));

    ss = alu(ss, or_opc, &eax_opnd, &eax_opnd);

    // Zero result from the fast helper => jump to the slow path;
    // displacement patched below.
    ss = branch8(ss, cc_eq, &Imm_Opnd(0), 0);
    char *backpatch_address__out_of_memory = ((char *)ss) - 1;
    
    ss = ret(ss, &Imm_Opnd(8));

    // Displacement is relative to the byte following the displacement byte.
    signed offset = (signed)ss - (signed)backpatch_address__out_of_memory - 1;
    *backpatch_address__out_of_memory = offset;

    // Slow path: retry inside a J2N frame.
    ss = gen_setup_j2n_frame(ss);

    ss = push(ss, &M_Base_Opnd(esp_reg, (sizeof(J2N_Saved_State)+4) ) );
    ss = push(ss, &M_Base_Opnd(esp_reg, (sizeof(J2N_Saved_State)+4) ) );

    ss = call(ss, (char *)orp_instantiate_cp_string_slow);
    ss = alu(ss, add_opc, &esp_opnd, &Imm_Opnd(8));

    ss = gen_pop_j2n_frame(ss);

    ss = ret(ss, &Imm_Opnd(8) );

    addr = stub;
    // Checked after emission, so this only detects an overflow after the fact.
    assert((ss - stub) < stub_size);
#ifdef ORP_VTUNE_SUPPORT
    //M: 
    vtune_notify_stub_load_finished("getaddress__orp_instantiate_cp_string_naked",(Byte*) stub,ss-stub);
#endif
    return addr;
} //getaddress__orp_instantiate_cp_string_naked



// Raises java/lang/IncompatibleClassChangeError on the current thread.
// Called from the interface-vtable stub after it has set up a J2N frame.
// Control is not expected to come back from throw_java_exception(), hence
// the trailing assert.
static void orp_throw_java_lang_IncompatibleClassChangeError()
{
    throw_java_exception("java/lang/IncompatibleClassChangeError");

    // Reaching this point means the throw returned, which is a bug.
    assert(0);
} //orp_throw_java_lang_IncompatibleClassChangeError


// Lazily generates (on first call) and returns the address of the
// interface-vtable lookup stub.
//
// Code emitted, as can be read off the emit calls below:
//       edx = [esp+8]; ecx = [esp+4]
//       if (ecx == 0) goto null_ref
//       push both stack arguments; call orp_get_interface_vtable; add esp, 8
//       if (eax == 0) goto not_found
//       ret 8
//   not_found:
//       set up a J2N frame
//       call orp_throw_java_lang_IncompatibleClassChangeError
//   null_ref:
//       eax = 0; ret 8
// edx is loaded but not read again by the emitted code.
// NOTE(review): [esp+4] is presumably the object reference and [esp+8] the
// interface class -- confirm against the JIT calling convention.
void * getaddress__orp_get_interface_vtable_old_naked()  //wjw verify that this works
{
    static void *addr = 0;
    if (addr) {
        return addr;
    }

    const int stub_size = 51;
    char *stub = (char *)gc_malloc_fixed_code_for_class_loading(stub_size);
#ifdef _DEBUG
    // Fill with breakpoint opcodes so stray execution traps in debug builds.
    memset(stub, 0xcc /*int 3*/, stub_size);
#endif
    char *ss = stub;

    ss = mov(ss, &edx_opnd, &M_Base_Opnd(esp_reg, +8) );
    ss = mov(ss, &ecx_opnd, &M_Base_Opnd(esp_reg, +4) );
    ss = alu(ss, or_opc, &ecx_opnd, &ecx_opnd);

    // Zero first argument => jump to the "return 0" tail; patched at the end.
    ss = branch8(ss, cc_eq, &Imm_Opnd(0), 0);
    char *backpatch_address__null_reference = ((char *)ss) - 1;

    // Same displacement twice: the first push moves esp by 4, so the second
    // push reads the neighbouring stack slot.
    ss = push(ss, &M_Base_Opnd(esp_reg, 8));
    ss = push(ss, &M_Base_Opnd(esp_reg, 8));
    ss = call(ss, (char *)orp_get_interface_vtable);
    ss = alu(ss, add_opc, &esp_opnd, &Imm_Opnd(8));

    ss = alu(ss, or_opc, &eax_opnd, &eax_opnd);

    // Zero lookup result => jump to the throwing path; patched below.
    ss = branch8(ss, cc_eq, &Imm_Opnd(0), 0);
    char *backpatch_address__interface_not_found = ((char *)ss) - 1;

    ss = ret(ss, &Imm_Opnd(8) );

    // Displacement is relative to the byte following the displacement byte.
    signed offset = (signed)ss - (signed)backpatch_address__interface_not_found - 1;
    *backpatch_address__interface_not_found = offset;

    ss = gen_setup_j2n_frame(ss);

    ss = call(ss, (char *)orp_throw_java_lang_IncompatibleClassChangeError);

    offset = (signed)ss - (signed)backpatch_address__null_reference - 1;
    *backpatch_address__null_reference = offset;

    // Null reference: return 0.
    ss = alu(ss, xor_opc, &eax_opnd, &eax_opnd);
    ss = ret(ss, &Imm_Opnd(8) );
    
    addr = stub;
    // Checked after emission, so this only detects an overflow after the fact.
    assert((ss - stub) < stub_size);
#ifdef ORP_VTUNE_SUPPORT
    //M:
    vtune_notify_stub_load_finished("getaddress__orp_get_interface_vtable_old_naked",(Byte*) stub,ss-stub);
#endif
    return addr;
} //getaddress__orp_get_interface_vtable_old_naked



// Raises java/lang/ArithmeticException on the current thread.
// GC must be disabled on entry; control is not expected to come back from
// throw_java_exception(), hence the trailing assert.
static void orp_throw_java_lang_ArithmeticException()
{
    assert(!orp_is_gc_enabled(p_TLS_orpthread));

    throw_java_exception("java/lang/ArithmeticException");

    // Reaching this point means the throw returned, which is a bug.
    assert(0);
} //orp_throw_java_lang_ArithmeticException



// 64-bit left shift helper (called from the lshl stub).
//
//   count - shift distance; only the low 6 bits are used.
//   n     - value to shift.
//
// Returns n << (count & 0x3f).  GC must be disabled on entry.
//
// With LONG_STA defined, every call is additionally logged (value and
// masked count) to a statistics file under the global critical section.
static uint64 orp_lshl(unsigned count, uint64 n)
{
    assert(  !orp_is_gc_enabled(p_TLS_orpthread) );
//yzw
#ifdef LONG_STA
	CriticalSectionHelper csp1(cs);
	static FILE *fp = NULL;
	if (!fp) fp = fopen("e:\\lshl_sta.txt", "w+");
	// Logging is best effort: skip it if the file could not be opened
	// instead of passing a NULL stream to fprintf/fflush.
	if (fp) {
		fprintf(fp, "%I64d\t%d\n", n, count & 0x3f);
		fflush(fp);
	}
#endif
//yzw

    return n << (count & 0x3f);
} //orp_lshl


// The arguments are:
// edx:eax          - the value to be shifted
// ecx              - how many bits to shift by
// The result is returned in edx:eax.


// Returns the address of the 64-bit shift-left stub, generating it on first
// use.  The emitted code pushes edx, eax and ecx, calls orp_lshl, drops the
// 12 bytes of arguments and returns with a plain "ret".
void * getaddress__orp_lshl_naked()
{
    static void *addr = 0;
    if (addr != 0) {
        return addr;
    }

    const int stub_size = 13;
    char *code = (char *)gc_malloc_fixed_code_for_class_loading(stub_size);
#ifdef _DEBUG
    memset(code, 0xcc /*int 3*/, stub_size);
#endif

    // Pushed last-argument-first: high dword, low dword, then the count.
    char *emit = push(code, &edx_opnd);
    emit = push(emit, &eax_opnd);
    emit = push(emit, &ecx_opnd);
    emit = call(emit, (char *)orp_lshl);
    emit = alu(emit, add_opc, &esp_opnd, &Imm_Opnd(12));
    emit = ret(emit);

    addr = code;
    assert((emit - code) < stub_size);
#ifdef ORP_VTUNE_SUPPORT
    vtune_notify_stub_load_finished("getaddress__orp_lshl_naked",(Byte*) code, emit - code);
#endif
    return addr;
} //getaddress__orp_lshl_naked


// 64-bit signed right shift helper (called from the lshr stub).
//
//   count - shift distance; only the low 6 bits are used.
//   n     - signed value to shift.
//
// Returns n >> (count & 0x3f).  GC must be disabled on entry.
//
// With LONG_STA defined, every call is additionally logged (count and
// value) to a statistics file under the global critical section.
static int64 orp_lshr(unsigned count, int64 n)
{
    assert(  !orp_is_gc_enabled(p_TLS_orpthread) );

#ifdef LONG_STA
	CriticalSectionHelper csp1(cs);
	static FILE *fp = NULL;
	if (!fp) fp = fopen("e:\\lshr_sta.txt", "w+");
	// Logging is best effort: skip it if the file could not be opened.
	if (fp) {
		// count is a 32-bit unsigned, so it must be printed with %u; using
		// a 64-bit conversion for it would misread the variadic arguments
		// and corrupt the value printed for n.
		fprintf(fp, "%u\t%I64d\n", count, n);
		fflush(fp);
	}
#endif

    return n >> (count & 0x3f);
} //orp_lshr


// The arguments are:
// edx:eax          - the value to be shifted
// ecx              - how many bits to shift by
// The result is returned in edx:eax.


// Returns the address of the 64-bit signed shift-right stub, generating it
// on first use.  The emitted code pushes edx, eax and ecx, calls orp_lshr,
// drops the 12 bytes of arguments and returns with a plain "ret".
void * getaddress__orp_lshr_naked()
{
    static void *addr = 0;
    if (addr != 0) {
        return addr;
    }

    const int stub_size = 13;
    char *code = (char *)gc_malloc_fixed_code_for_class_loading(stub_size);
#ifdef _DEBUG
    memset(code, 0xcc /*int 3*/, stub_size);
#endif

    // Pushed last-argument-first: high dword, low dword, then the count.
    char *emit = push(code, &edx_opnd);
    emit = push(emit, &eax_opnd);
    emit = push(emit, &ecx_opnd);
    emit = call(emit, (char *)orp_lshr);
    emit = alu(emit, add_opc, &esp_opnd, &Imm_Opnd(12));
    emit = ret(emit);

    addr = code;
    assert((emit - code) < stub_size);
#ifdef ORP_VTUNE_SUPPORT
    vtune_notify_stub_load_finished("getaddress__orp_lshr_naked",(Byte*) code, emit - code);
#endif
    return addr;
} //getaddress__orp_lshr_naked


// 64-bit unsigned (logical) right shift helper (called from the lushr stub).
//
//   count - shift distance; only the low 6 bits are used.
//   n     - unsigned value to shift.
//
// Returns n >> (count & 0x3f).  GC must be disabled on entry.
//
// With LONG_STA defined, every call is additionally logged (count and
// value) to a statistics file under the global critical section.
static uint64 orp_lushr(unsigned count, uint64 n)
{
    assert(  !orp_is_gc_enabled(p_TLS_orpthread) );

#ifdef LONG_STA
	CriticalSectionHelper csp1(cs);
	static FILE *fp = NULL;
	if (!fp) fp = fopen("e:\\lushr_sta.txt", "w+");
	// Logging is best effort: skip it if the file could not be opened.
	if (fp) {
		// count is a 32-bit unsigned, so it must be printed with %u; using
		// a 64-bit conversion for it would misread the variadic arguments
		// and corrupt the value printed for n.
		fprintf(fp, "%u\t%I64d\n", count, n);
		fflush(fp);
	}
#endif

    return n >> (count & 0x3f);
} //orp_lushr


// The arguments are:
// edx:eax          - the value to be shifted
// ecx              - how many bits to shift by
// The result is returned in edx:eax.


void * getaddress__orp_lushr_naked()
{
    // The stub is emitted on the first call and the same address is handed
    // back on every later call.
    static void *addr = 0;
    if (!addr) {
        const int stub_size = 13;
        char *const code_begin = (char *)gc_malloc_fixed_code_for_class_loading(stub_size);
#ifdef _DEBUG
        memset(code_begin, 0xcc /*int 3*/, stub_size);
#endif
        char *emit = code_begin;

        // Push edx, eax, ecx so the C helper receives ecx as its first
        // argument (count) and edx:eax as its 64-bit second argument.
        emit = push(emit, &edx_opnd);
        emit = push(emit, &eax_opnd);
        emit = push(emit, &ecx_opnd);
        emit = call(emit, (char *)orp_lushr);
        // Drop the three pushed dwords again, then return to the caller.
        emit = alu(emit, add_opc, &esp_opnd, &Imm_Opnd(12));
        emit = ret(emit);

        addr = code_begin;
        assert((emit - code_begin) < stub_size);
#ifdef ORP_VTUNE_SUPPORT
        //M:
        vtune_notify_stub_load_finished("getaddress__orp_lushr_naked",(Byte*) code_begin,emit-code_begin);
#endif
    }
    return addr;
} //getaddress__orp_lushr_naked


static int64 __stdcall orp_lmul(int64 m, int64 n) stdcall__;

// 64-bit multiply helper.
//
//   m, n - the two factors
//
// Returns the low 64 bits of the mathematical product m * n.  The
// multiplication is carried out on unsigned operands because overflow of a
// signed multiplication is undefined behavior in C++, whereas unsigned
// arithmetic is defined to wrap modulo 2^64; the low 64 bits are the same
// either way.
static int64 __stdcall orp_lmul(int64 m, int64 n)
{
    assert(  !orp_is_gc_enabled(p_TLS_orpthread) );

    return (int64)((uint64)m * (uint64)n);
} //orp_lmul

#ifdef ORP_LONG_OPT
// Multiply helper written in MSVC inline assembly; only compiled when
// ORP_LONG_OPT is defined.
//
// What the assembly does, in terms of the three dwords it reads:
//   A = [ebp+08h], B = [ebp+0ch], C = [ebp+10h]
//   eax = low32(A * C)
//   edx = high32(A * C) + low32(B * C)
// The dword at [ebp+14h] is never read.
// NOTE(review): with a conventional ebp frame and the __stdcall signature
// below, A/B would be the low/high halves of m and C the low half of n, so
// this would compute m * (unsigned 32-bit n) -- confirm against the callers
// and the frame layout the compiler actually generates.
//
// There is no C-level return statement; the value is whatever the assembly
// leaves in edx:eax.  ebx is overwritten by the assembly.
static int64 __stdcall orp_lmul_const_multiplier(int64 m, int64 n) stdcall__;

static int64 __stdcall orp_lmul_const_multiplier(int64 m, int64 n)
{
    assert(  !orp_is_gc_enabled(p_TLS_orpthread) );
	__asm{
		mov  eax,dword ptr [ebp+0ch]
		mov  ecx,dword ptr [ebp+10h]
		mul   ecx 
		mov   ebx,eax
		mov   eax,dword ptr [ebp+08h]
		mul   ecx
		add   edx,ebx
	}
} //orp_lmul_const_multiplier
#endif


static int64 __stdcall orp_lrem(int64 m, int64 n) stdcall__;

// 64-bit signed remainder helper.
//
//   m - dividend
//   n - divisor; must be non-zero (the stub built by
//       getaddress__orp_lrem_naked() tests for zero before jumping here)
//
// Returns m % n.  A divisor of -1 is answered directly with 0: that is the
// mathematically correct remainder for every dividend, and it keeps the
// most negative dividend away from the C++ '%' operator, for which
// MIN % -1 is undefined behavior.
static int64 __stdcall orp_lrem(int64 m, int64 n)
{
    assert(  !orp_is_gc_enabled(p_TLS_orpthread) );
    assert(n);

    if (n == -1) {
        return 0;
    }
    return m % n;
} //orp_lrem


void * getaddress__orp_lrem_naked()
{
    // Emitted once; later calls return the cached address.
    static void *addr = 0;
    if (!addr) {
        const int stub_size = 26;
        char *const stub_start = (char *)gc_malloc_fixed_code_for_class_loading(stub_size);
#ifdef _DEBUG
        memset(stub_start, 0xcc /*int 3*/, stub_size);
#endif
        char *cursor = stub_start;

        // OR together the two dwords at [esp+12] and [esp+16] (the second
        // 64-bit argument of orp_lrem); the result is zero only when the
        // whole divisor is zero.
        cursor = mov(cursor, &eax_opnd, &M_Base_Opnd(esp_reg, +12) );
        cursor = alu(cursor, or_opc, &eax_opnd, &M_Base_Opnd(esp_reg, +16));

        // Conditional branch to the divide-by-zero path.  Its 8-bit
        // displacement is not known yet, so remember where that byte lives.
        cursor = branch8(cursor, cc_eq, &Imm_Opnd(0), 0);
        char *const rel8_slot = cursor - 1;

        // Non-zero divisor: tail-jump into the C helper.
        cursor = jump(cursor, (char *)orp_lrem);

        // Patch the branch so that it lands right here.
        signed offset = (signed)cursor - (signed)rel8_slot - 1;
        *rel8_slot = offset;

        // Divide-by-zero path: set up a java-to-native frame and throw.
        cursor = gen_setup_j2n_frame(cursor);
        cursor = call(cursor, (char *)orp_throw_java_lang_ArithmeticException);

        assert((cursor - stub_start) < stub_size);
        addr = stub_start;
#ifdef ORP_VTUNE_SUPPORT
        //M: WangYong
        vtune_notify_stub_load_finished("getaddress__orp_lrem_naked",(Byte*) stub_start,cursor-stub_start);
#endif
    }
    return addr;
} //getaddress__orp_lrem_naked


// Emits, at s, a call to the routine whose address is returned by
// getaddress__setup_java_to_native_frame().
// Returns the position just past the emitted call.
char *gen_setup_j2n_frame(char *s)
{
    return call(s, (char *)getaddress__setup_java_to_native_frame() );
} //gen_setup_j2n_frame


// Emits, at s, a call to the routine whose address is returned by
// getaddress__pop_java_to_native_frame().
// Returns the position just past the emitted call.
char *gen_pop_j2n_frame(char *s)
{
    return call(s, (char *)getaddress__pop_java_to_native_frame() );
} //gen_pop_j2n_frame


static int64 __stdcall orp_ldiv(int64 m, int64 n) stdcall__;

// 64-bit signed division helper.
//
//   m - dividend
//   n - divisor; must be non-zero (the stub built by
//       getaddress__orp_ldiv_naked() tests for zero before jumping here)
//
// Returns m / n, truncated toward zero.  A divisor of -1 is handled as a
// wrapping negation of m: for the most negative dividend the true quotient
// does not fit in 64 bits, the C++ '/' operator is undefined for it, and
// the wrapped result (the dividend itself) is what the JVM ldiv
// instruction specifies.
static int64 __stdcall orp_ldiv(int64 m, int64 n)
{
    assert(  !orp_is_gc_enabled(p_TLS_orpthread) );
    assert(n);

    if (n == -1) {
        // Negate in unsigned arithmetic so the overflow case wraps.
        return (int64)(0 - (uint64)m);
    }
    return m / n;
} //orp_ldiv


static void *getaddress__orp_ldiv_naked()
{
    static void *addr = 0;
    if(addr) {
        return addr;
    }

    const int stub_size = 25;
    char *stub = (char *)gc_malloc_fixed_code_for_class_loading(stub_size);
#ifdef _DEBUG
    memset(stub, 0x90, stub_size);   // nop
#endif
    char *s = stub;
    s = mov(s, &eax_opnd, &M_Base_Opnd(esp_reg, 12));
    s = alu(s, or_opc, &eax_opnd, &M_Base_Opnd(esp_reg, 16));
    s = branch8(s, cc_eq, &Imm_Opnd(5), 1);  // skip 5 bytes over the next instruction
    s = jump32(s, &Imm_Opnd((((uint32)orp_ldiv) - ((uint32)s)) - 5));

    s = gen_setup_j2n_frame(s);

    s = call(s, (char *)orp_throw_java_lang_ArithmeticException);


    assert((s - stub) <= stub_size);
    addr = stub;
#ifdef ORP_VTUNE_SUPPORT
    //M:
    vtune_notify_stub_load_finished("getaddress__orp_ldiv_naked",(Byte*) stub,s-stub);
#endif
    return addr;
} //getaddress__orp_ldiv_naked

// C-level counterpart of the remainder-by-constant stub.
//
//   m       - dividend
//   magic_a - used by the disabled listing as the address of a table of
//             precomputed values (divisor at +40/+44, shift at +48,
//             m' at +52, d_sign at +56, per the listing's own comments)
//
// As compiled, the function only asserts and returns 0: the whole algorithm
// sits inside "#if 0".  The same instruction sequence, label for label, is
// emitted by getaddress__orp_const_lrem_naked(), so the listing below reads
// as the source form of that stub.
static int64 __stdcall orp_const_lrem(int64 m, unsigned magic_a) stdcall__;

static int64 __stdcall orp_const_lrem(int64 m, unsigned magic_a)
{
    assert(  !orp_is_gc_enabled(p_TLS_orpthread) );

// Disabled reference listing (never compiled).
#if 0

	/***********************************************************************
	 * It's only used for 64/32 , and the quotient is smaller than 2^N     *
	 ***********************************************************************/
	__asm{
		////////// Test [magic_2+28]
		mov esi, [ebp+12] ;//x-hi
		mov edi, [ebp+16] ;//magic_a
		mov eax, [edi+40] ;//d_32
		cmp esi,eax ;
		jae slower ;

		/////////////////Get l
		mov ebx, [edi+48] ;//l (sh)

		//n2
		mov eax, [ebp+12] ; //x0 
		mov edx, [ebp+8] ;  //x1
		mov ecx, 0x20 ;
		sub ecx, ebx ;
		je l_is_32 ;//l ==32
		//We compute n2 and n10 together
		shld eax, edx, cl ;
		shl  edx, cl ; // eax: n2 , edx: n10
l_is_32:
		//n2 == n0 , n10 == n1

		//-n1
		mov ebx, edx ;
		sar ebx, 31 ;// ebx: XSIGN(n10) 

		//nadj
		mov esi, [edi+56] ;// d_sign(d_norm)
		and esi, ebx ;//AND(-n1,d_norm - 2N)
		add esi, edx ;

		//q1
		mov ecx, eax ;
		cmp eax, ebx ;
		je mul_zero ;
		sub eax, ebx ;
		mov ebx, eax ;
		mov edx, [edi+52] ;// edx: m'
		mul edx ;
		jmp next1 ;
mul_zero:
		xor edx,edx ;
		xor eax,eax ;
next1:
		add eax, esi ;
		adc edx, 0 ; //m'*(n2-(-n1))+nadj
		add ecx, edx ;
		mov esi, ecx ;//esi: q1=n2+HIGH(m'*(n2-(-n1))+nadj) 

		//dr
		mov eax, esi ;
		not eax ;//2N-1-q1
		mov ebx, [edi+40] ;//ebx: divisor
		mul ebx ;// edx:eax = (2N-1-q1)*d
		mov ecx, [ebp+12] ;//x0
		mov edi, [ebp+8] ;//x1, ecx:edi = n
		sub ecx, ebx ;//n-2N*d
		add eax, edi ;
		adc edx, ecx ;//edx:eax = dr

		//q
		mov edi,edx ;
		//mov ecx, esi ;
		//not ecx ;//2N-1-q1
		//sub edx, ecx ;//edx: q

		//r
		and edi,ebx ;//AND(d-2N,HIGH(dr)) ???
		add eax,edi ;//eax: r
		xor edx,edx ;

		jmp end ;

slower:
		// Fall back to the general helper with the divisor read from the table.
		push [edi+44] ;//y0
		push [edi+40] ;//y1
		push [ebp+12] ;//x0
		push [ebp+8] ; //x1
		call orp_lrem ;
end:
	}

//	int64* d = (int64*)(magic_a+40) ;

//	return m%(*d) ;
#endif

// The only code that is actually compiled: reaching this function is
// treated as a programming error.
#if 1
	assert(0) ;
	return 0 ;
#endif
} //orp_const_lrem


// Builds (once) and returns the address of the machine-code stub that
// computes a 64-bit remainder by a divisor described through a table of
// precomputed values.
//
// The emitted stub
//   - sets up an ebp frame and saves ebx/esi/edi,
//   - reads the dividend from [ebp+8]/[ebp+0x0c] and the table address
//     from [ebp+0x10],
//   - takes the "slower" path (a call to orp_lrem with the divisor loaded
//     from table offsets +0x28/+0x2c) when the unsigned compare of the
//     dividend's high dword against [table+0x28] is above-or-equal,
//   - otherwise runs the multiply/shift sequence below,
//   - returns with "ret 0x0c", i.e. the stub itself pops 12 bytes of
//     arguments.
//
// The instruction sequence mirrors, label for label, the disabled assembly
// listing inside orp_const_lrem().
//
// IMPORTANT: every branch displacement passed as Imm_Opnd(...) is a
// hard-coded byte count to the label named in the trailing comment.  Adding,
// removing or re-encoding any instruction between a branch and its target
// requires recomputing those constants by hand.
void * getaddress__orp_const_lrem_naked()
{
    static void *addr = 0;
    if (addr) {
        return addr;
    }

	const int stub_size = 200 ;
    char *stub = (char *)gc_malloc_fixed_code_for_class_loading(stub_size);
	char* ss = stub ;

    // Prologue: frame pointer plus callee-saved registers.
    ss = push(ss, &ebp_opnd);
    ss = mov(ss, &ebp_opnd, &esp_opnd);
    ss = push(ss, &ebx_opnd);
    ss = push(ss, &esi_opnd);
    ss = push(ss, &edi_opnd);

    // esi = dividend high dword, edi = table address, eax = [table+0x28].
    ss = mov(ss, &esi_opnd, &M_Base_Opnd(ebp_reg, +0x0c) );
    ss = mov(ss, &edi_opnd, &M_Base_Opnd(ebp_reg, +0x10) );
    ss = mov(ss, &eax_opnd, &M_Base_Opnd(edi_reg, +0x28) );
    ss = alu(ss, cmp_opc, &esi_opnd, &eax_opnd);
    ss = branch8(ss, cc_ge, &Imm_Opnd(0x60), 0); // jae slower
    ss = mov(ss, &ebx_opnd, &M_Base_Opnd(edi_reg, +0x30) );
    ss = mov(ss, &eax_opnd, &M_Base_Opnd(ebp_reg, +0x0c) );
    ss = mov(ss, &edx_opnd, &M_Base_Opnd(ebp_reg, +0x8) );
    ss = mov(ss, &ecx_opnd, &Imm_Opnd(0x20) );
    ss = alu(ss, sub_opc, &ecx_opnd, &ebx_opnd);
    ss = branch8(ss, cc_eq, &Imm_Opnd(0x5), 0); // je l_is_32
    ss = shift(ss, shld_opc, &eax_opnd, &edx_opnd); //shld eax,edx,cl
    ss = shift(ss, shl_opc, &edx_opnd); //shl edx,cl
//l_is_32:
    ss = mov(ss, &ebx_opnd, &edx_opnd);
    ss = shift(ss, sar_opc, &ebx_opnd, &Imm_Opnd(0x1f)); //sar ebx,31
    ss = mov(ss, &esi_opnd, &M_Base_Opnd(edi_reg, +0x38) );
    ss = alu(ss, and_opc, &esi_opnd, &ebx_opnd);
    ss = alu(ss, add_opc, &esi_opnd, &edx_opnd);
    ss = mov(ss, &ecx_opnd, &eax_opnd);
    ss = alu(ss, cmp_opc, &eax_opnd, &ebx_opnd);
    ss = branch8(ss, cc_eq, &Imm_Opnd(0x0b), 0); // je mul_zero
    ss = alu(ss, sub_opc, &eax_opnd, &ebx_opnd);
    ss = mov(ss, &ebx_opnd, &eax_opnd);
    ss = mov(ss, &edx_opnd, &M_Base_Opnd(edi_reg, +0x34) );
    ss = mul(ss, &edx_opnd,0);
    ss = jump8(ss, &Imm_Opnd(0x4)); // jmp next1
//mul_zero:
    ss = alu(ss, xor_opc, &edx_opnd, &edx_opnd);
    ss = alu(ss, xor_opc, &eax_opnd, &eax_opnd);
//next1:
    ss = alu(ss, add_opc, &eax_opnd, &esi_opnd);
    ss = alu(ss, adc_opc, &edx_opnd, &Imm_Opnd(0));
    ss = alu(ss, add_opc, &ecx_opnd, &edx_opnd);
    ss = mov(ss, &esi_opnd, &ecx_opnd);
    ss = mov(ss, &eax_opnd, &esi_opnd);
    ss = _not(ss, &eax_opnd);
    ss = mov(ss, &ebx_opnd, &M_Base_Opnd(edi_reg, +0x28) );
    ss = mul(ss, &ebx_opnd,0);
    ss = mov(ss, &ecx_opnd, &M_Base_Opnd(ebp_reg, +0x0c) );
    ss = mov(ss, &edi_opnd, &M_Base_Opnd(ebp_reg, +0x8) );
    ss = alu(ss, sub_opc, &ecx_opnd, &ebx_opnd);
    ss = alu(ss, add_opc, &eax_opnd, &edi_opnd);
    ss = alu(ss, adc_opc, &edx_opnd, &ecx_opnd);
    ss = mov(ss, &edi_opnd, &edx_opnd);
    ss = alu(ss, and_opc, &edi_opnd, &ebx_opnd);
    ss = alu(ss, add_opc, &eax_opnd, &edi_opnd);
    // Fast-path result is in eax; the high half is cleared.
    ss = alu(ss, xor_opc, &edx_opnd, &edx_opnd);
    ss = jump8(ss,&Imm_Opnd(0x11)); // jmp end
//slower:
    // General path: orp_lrem(dividend, divisor read from the table).
    // edi still holds the table address here because this label is only
    // reached from the branch emitted before edi is overwritten.
    ss = push(ss, &M_Base_Opnd(edi_reg, +0x2c) );
    ss = push(ss, &M_Base_Opnd(edi_reg, +0x28) );
    ss = push(ss, &M_Base_Opnd(ebp_reg, +0x0c) );
    ss = push(ss, &M_Base_Opnd(ebp_reg, +0x8) );
    ss = call(ss, (char *)orp_lrem);
//end:

    // Epilogue: restore registers and frame, pop 12 bytes of arguments.
    ss = pop(ss, &edi_opnd);
    ss = pop(ss, &esi_opnd);
    ss = pop(ss, &ebx_opnd);
    ss = mov(ss, &esp_opnd, &ebp_opnd);
    ss = pop(ss, &ebp_opnd);

    ss = ret(ss, &Imm_Opnd(0x0c));

// Disabled alternative: a stub that merely jumps to the C function
// orp_const_lrem().
#if 0
    const int stub_size = 26;
    char *stub = (char *)gc_malloc_fixed_code_for_class_loading(stub_size);
#ifdef _DEBUG
    memset(stub, 0xcc /*int 3*/, stub_size);
#endif
    char *ss = stub;

    ss = jump32(ss, &Imm_Opnd((((uint32)orp_const_lrem) - ((uint32)ss)) - 5));

#endif

    assert((ss - stub) < stub_size);
    addr = stub;
#ifdef ORP_VTUNE_SUPPORT
    //M: 
    vtune_notify_stub_load_finished("getaddress__orp_const_lrem_naked",(Byte*) stub,ss-stub);
#endif
    return addr;
} //getaddress__orp_const_lrem_naked

// C-level counterpart of the division-by-constant stub.
//
//   m       - dividend
//   magic_a - used by the disabled listings as the address of a table of
//             precomputed values (per the listings' own comments: m.l at +0,
//             m.h at +4, sh1 at +12, sh at +16, d_sign at +20, m' at +28,
//             sh2 at +32, d_32 at +40, l at +48, m' at +52, d_sign(d_norm)
//             at +56)
//
// As compiled, the function only asserts and returns 0: both assembly
// listings sit inside "#if 0".  The first listing (32/32, 64/32 and 64/64
// paths) is the one mirrored, label for label, by the stub emitted in
// getaddress__orp_const_ldiv_naked().  The second listing is an older
// variant with only the 32/32 and 64/64 paths; it refers to tmp0, tmp1 and
// x_fan, which are declared only inside the first disabled block.
static int64 __stdcall orp_const_ldiv(int64 m, unsigned magic_a) stdcall__;

static int64 __stdcall orp_const_ldiv(int64 m, unsigned magic_a)
{
    assert(  !orp_is_gc_enabled(p_TLS_orpthread) );

// Disabled reference listing #1 (never compiled).
#if 0
	unsigned tmp0, tmp1, x_fan ;// for temporary storage.

	__asm{
		////////// Test [magic_2+28]
		mov esi, [ebp+12] ;//x0
		or esi,esi ;
		jne fast_64 ;
		mov edi, [ebp+16] ;
		mov ecx, [edi+28] ;
		or ecx, ecx ;
		je fast_64_2 ;

		/*****************************************************************
		 * Fast 32/32                                                    *
		 *****************************************************************/
		/////////////////Cal 32-bit MULUH(m',n)
		//mov ecx, [edi+28] ;//m'
		mov eax, [ebp+8] ;//x-lo
		mov esi,eax ;

		mul ecx ;//t1=edx:eax = m' * n
		sub esi, edx ;//n-t1
		mov ecx, [edi+12] ;//sh1
		shr esi, cl ;// (n-t1)>>sh1
		add edx, esi ;// t1+ ((n-t1)>>sh1)
		mov ecx, [edi+32] ;//sh2
		shr edx, cl ;

		mov eax, edx ;
		xor edx, edx ;//ret
		jmp end ;

fast_64:
		mov edi, [ebp+16] ;//magic_a
fast_64_2:
		mov eax, [edi+40] ;//d_32
		or esi, esi ;
		js fast_64_64 ; // x<0, goto 64/64
		cmp esi,eax ;
		jae fast_64_64 ; //64/64

		/*****************************************************************
		 * Fast 64/32                                                    *
		 *****************************************************************/
		/////////////////Get l
		mov ebx, [edi+48] ;//l (sh)

		//n2
		//mov eax, [ebp+12] ; //x0 
		mov eax, esi ; //x0 , has been loaded in esi.
		mov edx, [ebp+8] ;  //x1
		mov ecx, 0x20 ;
		sub ecx, ebx ;
		je l_is_32 ;//l ==32
		//We compute n2 and n10 together
		shld eax, edx, cl ;
		shl  edx, cl ; // eax: n2 , edx: n10
l_is_32:
		//n2 == n0 , n10 == n1

		//-n1
		mov ebx, edx ;
		sar ebx, 31 ;// ebx: XSIGN(n10) 

		//nadj
		mov esi, [edi+56] ;// d_sign(d_norm)
		and esi, ebx ;//AND(-n1,d_norm - 2N)
		add esi, edx ;

		//q1
		mov ecx, eax ;
		cmp eax, ebx ;
		je mul_zero ;
		sub eax, ebx ;
		mov ebx, eax ;
		mov edx, [edi+52] ;// edx: m'
		mul edx ;
		jmp next1 ;
mul_zero:
		xor edx,edx ;
		xor eax,eax ;
next1:
		add eax, esi ;
		adc edx, 0 ; //m'*(n2-(-n1))+nadj
		add ecx, edx ;
		mov esi, ecx ;//esi: q1=n2+HIGH(m'*(n2-(-n1))+nadj) 

		//dr
		mov eax, esi ;
		not eax ;//2N-1-q1
		mov ebx, [edi+40] ;//ebx: divisor
		mul ebx ;// edx:eax = (2N-1-q1)*d
		mov ecx, [ebp+12] ;//x0
		mov edi, [ebp+8] ;//x1, ecx:edi = n
		sub ecx, ebx ;//n-2N*d
		add eax, edi ;
		adc edx, ecx ;//edx:eax = dr

		//q
		mov ecx, esi ;
		not ecx ;//2N-1-q1
		mov eax, edx ;
		sub eax, ecx ;
		xor edx,edx ; //edx:eax: q

		jmp end ;

		/*****************************************************************
		 * Fast 64/64                                                    *
		 *****************************************************************/
		//////////Cal x_sign, x_fan, etc
//		mov esi, [ebp+12] ;
		//##
fast_64_64:
		or esi, esi ;
		jge positive ;// positive
		//##
		mov edx, esi ;
		mov ecx, esi ;
		sar ecx, 31 ;// x_fan = ecx:ecx
		shr esi, 31 ;//	x_sign = 0:esi
		mov x_fan, ecx ;

		/////////Cal xx
		mov eax, [ebp+8] ;
		xor eax, ecx ;
		xor edx, ecx ;
		add eax, esi ;
		adc edx, 0 ; // xx = ( x_fan^x)+x_sign = edx:eax
		mov tmp0, edx ;
		mov tmp1, eax ; // xx = tmp0:tmp1
		jmp Cal_t1 ;
positive:
		mov tmp0, esi ;
		mov eax, [ebp+8] ;
		mov tmp1, eax ; //xx = tmp0:tmp1
		xor esi,esi ;
		mov x_fan, esi ;
		
		////////Cal t1
Cal_t1:
		mov ecx, eax ; //ecx = eax = tmp1
		//mov edi, [ebp+16] ; //magic_a
		mov ebx, [edi] ;//m.l
		mul ebx ; // edx:eax = xx.l * m.l
		sub eax, 1 ;
		sbb edx, 1 ;// xx.l*m.l -1
		add eax, esi ;
		adc edx, 0 ; // LL = xx.l*m.l -(1-x_sign)
		mov edi, edx ; 
		mov eax, tmp0 ; 
		mul ebx ; // m.l * xx.h
		add eax, edi ;
		adc edx, 0 ;
		mov edi, edx ;
		mov esi, eax ;// LH = edi:esi = m.l * xx.h + HIGH(LL)
		mov eax, ecx ; //tmp1
		mov ebx, [ebp+16] ; //magic_a
		mov ebx, [ebx+4] ;//m.h
		mul ebx ;
		add eax, esi ;
		adc edx, edi ;
		mov edi, 0 ;
		adc edi, 0 ;
		mov esi, edx ;//HLLH = edi:esi =  HIGH(m.h * xx.l + LH)
		mov eax, tmp0
		mul ebx ;
		add eax, esi ;
		adc edx, edi ;// t1 = edx:eax

		///////////Cal q0
		mov edi, x_fan ;
//		test edi,edi ;
//		je b_1 ;
		mov ebx, edi ;
		not edi ; //~x_fan
		xor eax, edi ;
		xor edx, edi ;// ~x_fan ^ t1
//		b_1:
		mov edi, [ebp+8] ;
		mov esi, [ebp+12] ;// x = esi:edi
		add eax, edi ;					 
		adc edx, esi ; //q0	= edx:eax

		//////////Cal q0 = (q0 >> sh) - (INT64)x_fan, sh = 1a
		mov ecx, [ebp+16] ; //magic_a
		mov ecx, [ecx+16] ;//sh
		cmp ecx, 0x20 ;
		jge biger_than_32 ;
		// sh< 32
		shrd eax, edx, cl ;//0x1a ; //Be careful if sh >=32 ! Just shift the up one!
		sar edx, cl ;//0x1a ;
		jmp next ;
		// sh>=32
biger_than_32:
		sub ecx,0x20 ;
		mov eax, edx ; //0x27
		sar edx, 31 ;
		sar eax, cl ; //7
		//
next:
		sub eax, ebx ;
		sbb edx, ebx ;	 //q0 = edx:eax

		//////////Cal 	INT64 q = (q0 ^ d_sign) - d_sign ; d_sign = 0 ;
		mov ecx, [ebp+16] ;
		mov ecx, [ecx+20] ;
		test ecx,ecx ;
		je end ;
		xor eax, ecx ;
		xor edx, ecx ;
		sub eax, ecx ;
		sbb edx, ecx ; //q = edx:eax !
end:
 	}
#endif

// Disabled reference listing #2 (never compiled): older variant without
// the 64/32 path.
#if 0
	__asm{
		////////// Test [magic_2+28]
		mov esi, [ebp+12] ;//x-hi
		or esi,esi ;
		jne normal2 ;
		mov edi, [ebp+16] ;
		mov ecx, [edi+28] ;
		or ecx,ecx ;
		je normal1 ;

		/////////////////Cal 32-bit MULUH(m',n)
		//mov ecx, [edi+28] ;//m'
		mov eax, [ebp+8] ;//x-lo
		mov esi,eax ;

		mul ecx ;//t1=edx:eax = m' * n
		sub esi, edx ;//n-t1
		mov ecx, [edi+12] ;//sh1
		shr esi, cl ;// (n-t1)>>sh1
		add edx, esi ;// t1+ ((n-t1)>>sh1)
		mov ecx, [edi+32] ;//sh2
		shr edx, cl ;

		mov eax, edx ;
		xor edx, edx ;//ret
		jmp end ;

		//////////Cal x_sign, x_fan, etc
//normal:
		//mov esi, [ebp+12] ;
		//##
normal1:
		or esi, esi ;
normal2:
		jge positive ;// positive
		//##
		mov edx, esi ;
		mov ecx, esi ;
		sar ecx, 31 ;// x_fan = ecx:ecx
		shr esi, 31 ;//	x_sign = 0:esi
		mov x_fan, ecx ;

		/////////Cal xx
		mov eax, [ebp+8] ;
		xor eax, ecx ;
		xor edx, ecx ;
		add eax, esi ;
		adc edx, 0 ; // xx = ( x_fan^x)+x_sign = edx:eax
		mov tmp0, edx ;
		mov tmp1, eax ; // xx = tmp0:tmp1
		jmp Cal_t1 ;
positive:
		mov tmp0, esi ;
		mov eax, [ebp+8] ;
		mov tmp1, eax ; //xx = tmp0:tmp1
		xor esi,esi ;
		mov x_fan, esi ;
		
		////////Cal t1
Cal_t1:
		mov ecx, eax ; //ecx = eax = tmp1
		//mov ebx, 0x010E222b ;//0xa209d0f6 ; //m.l
		mov edi, [ebp+16] ; //magic_a
		mov ebx, [edi] ;//m.l
		mul ebx ; // edx:eax = xx.l * m.l
		sub eax, 1 ;
		sbb edx, 1 ;// xx.l*m.l -1
		add eax, esi ;
		adc edx, 0 ; // LL = xx.l*m.l -(1-x_sign)
		mov edi, edx ; 
		//mov eax, [ebp+12] ;
		mov eax, tmp0 ; 
		mul ebx ; // m.l * xx.h
		add eax, edi ;
		adc edx, 0 ;
		mov edi, edx ;
		mov esi, eax ;// LH = edi:esi = m.l * xx.h + HIGH(LL)
		mov eax, ecx ; //tmp1
		//mov ebx,  0x733d858b ;//0x3928af3a ; //m.h
		mov ebx, [ebp+16] ; //magic_a
		mov ebx, [ebx+4] ;//m.h
		mul ebx ;
		add eax, esi ;
		adc edx, edi ;
		mov edi, 0 ;
		adc edi, 0 ;
		mov esi, edx ;//HLLH = edi:esi =  HIGH(m.h * xx.l + LH)
		//mov ebx, [ebp+16] ; //magic_a
		//mov ebx, [ebx+4] ;//m.h ,still in ebx?
		mov eax, tmp0
		mul ebx ;
		add eax, esi ;
		adc edx, edi ;// t1 = edx:eax

		///////////Cal q0
		mov edi, x_fan ;
//		test edi,edi ;
//		je b_1 ;
		mov ebx, edi ;
		not edi ; //~x_fan
		xor eax, edi ;
		xor edx, edi ;// ~x_fan ^ t1
//		b_1:
		mov edi, [ebp+8] ;
		mov esi, [ebp+12] ;// x = esi:edi
		add eax, edi ;					 
		adc edx, esi ; //q0	= edx:eax

		//////////Cal q0 = (q0 >> sh) - (INT64)x_fan, sh = 1a
		mov ecx, [ebp+16] ; //magic_a
		mov ecx, [ecx+16] ;//sh
		cmp ecx, 0x20 ;
		jge biger_than_32 ;
		// sh< 32
		shrd eax, edx, cl ;//0x1a ; //Be careful if sh >=32 ! Just shift the up one!
		sar edx, cl ;//0x1a ;
		jmp next ;
		// sh>=32
biger_than_32:
		sub ecx,0x20 ;
		mov eax, edx ; //0x27
		sar edx, 31 ;
		sar eax, cl ; //7
		//
next:
		sub eax, ebx ;
		sbb edx, ebx ;	 //q0 = edx:eax

		//////////Cal 	INT64 q = (q0 ^ d_sign) - d_sign ; d_sign = 0 ;
		mov ecx, [ebp+16] ;
		mov ecx, [ecx+20] ;
		test ecx,ecx ;
		je end ;
		xor eax, ecx ;
		xor edx, ecx ;
		sub eax, ecx ;
		sbb edx, ecx ; //q = edx:eax !
end:

	}
#endif

// The only code that is actually compiled: reaching this function is
// treated as a programming error.
#if 1
	assert(0) ;
	return 0 ;
#endif
} //orp_const_ldiv

// Builds (once) and returns the address of the machine-code stub that
// divides a 64-bit dividend by a divisor described through a table of
// precomputed values.
//
// The emitted stub
//   - sets up an ebp frame, reserves 0x4c bytes of locals and saves
//     ebx/esi/edi,
//   - reads the dividend from [ebp+8]/[ebp+0x0c] and the table address
//     from [ebp+0x10],
//   - dispatches to one of three paths (32/32, 64/32, 64/64) based on the
//     dividend's high dword and on table entries +0x1c and +0x28,
//   - uses [ebp-0x04], [ebp-0x08] and [ebp-0x0c] as scratch slots in the
//     64/64 path (tmp0, tmp1 and x_fan in the reference listing),
//   - leaves the quotient in edx:eax and returns with "ret 0x0c".
//
// The instruction sequence mirrors, label for label, the first disabled
// assembly listing inside orp_const_ldiv().
//
// IMPORTANT: every branch/jump displacement passed as Imm_Opnd(...) is a
// hard-coded byte count to the label named in the trailing comment.  Adding,
// removing or re-encoding any instruction between a branch and its target
// requires recomputing those constants by hand.
static void *getaddress__orp_const_ldiv_naked()
{
    static void *addr = 0;
    if(addr) {
        return addr;
    }

/******************************************************************************
 * fast long division in this function
 ******************************************************************************/
    const int stub_size = 400;
    char *stub = (char *)gc_malloc_fixed_code_for_class_loading(stub_size);
	char* ss = stub ;

    // Prologue: frame pointer, local area, callee-saved registers.
    ss = push(ss, &ebp_opnd);
    ss = mov(ss, &ebp_opnd, &esp_opnd);
	ss = alu(ss, sub_opc, &esp_opnd, &Imm_Opnd(0x4c)) ;
    ss = push(ss, &ebx_opnd);
    ss = push(ss, &esi_opnd);
    ss = push(ss, &edi_opnd);

    // esi = dividend high dword; non-zero means the 32/32 path cannot apply.
    ss = mov(ss, &esi_opnd, &M_Base_Opnd(ebp_reg, +0x0c) );
	ss = alu(ss, or_opc, &esi_opnd, &esi_opnd) ;
    ss = branch8(ss, cc_ne, &Imm_Opnd(0x28), 0); // jne fast_64
    ss = mov(ss, &edi_opnd, &M_Base_Opnd(ebp_reg, +0x10) );
    ss = mov(ss, &ecx_opnd, &M_Base_Opnd(edi_reg, +0x1c) );
	ss = alu(ss, or_opc, &ecx_opnd, &ecx_opnd) ;
    ss = branch8(ss, cc_eq, &Imm_Opnd(0x21), 0); // je fast_64_2
	/*******************************************************************************
	 * Fast 32/32
	 *******************************************************************************/
    ss = mov(ss, &eax_opnd, &M_Base_Opnd(ebp_reg, +0x08) );
    ss = mov(ss, &esi_opnd, &eax_opnd );
	ss = mul(ss, &ecx_opnd, 0) ;
	ss = alu(ss, sub_opc, &esi_opnd, &edx_opnd) ;
    ss = mov(ss, &ecx_opnd, &M_Base_Opnd(edi_reg, +0x0c) );
	ss = shift(ss, shr_opc, &esi_opnd) ;
	ss = alu(ss, add_opc, &edx_opnd, &esi_opnd) ;
    ss = mov(ss, &ecx_opnd, &M_Base_Opnd(edi_reg, +0x20) );
	ss = shift(ss, shr_opc, &edx_opnd) ;
    ss = mov(ss, &eax_opnd, &edx_opnd );
	ss = alu(ss, xor_opc, &edx_opnd, &edx_opnd) ;
    ss = jump32(ss,&Imm_Opnd(0x130)); // jmp end
//fast_64:
    ss = mov(ss, &edi_opnd, &M_Base_Opnd(ebp_reg, +0x10) );
//fast_64_2:
    ss = mov(ss, &eax_opnd, &M_Base_Opnd(edi_reg, +0x28) );
	ss = alu(ss, or_opc, &esi_opnd, &esi_opnd) ;
    // The listing uses "js" here; the stub emits a signed less-than branch
    // directly after "or esi,esi".
    ss = branch8(ss, cc_lt, &Imm_Opnd(0x68), 1); // js fast_64_64
	ss = alu(ss, cmp_opc, &esi_opnd, &eax_opnd) ;
    ss = branch8(ss, cc_ge, &Imm_Opnd(0x64), 0); // jae fast_64_64
	/*******************************************************************************
	 * Fast 64/32
	 *******************************************************************************/
    ss = mov(ss, &ebx_opnd, &M_Base_Opnd(edi_reg, +0x30) );
    ss = mov(ss, &eax_opnd, &esi_opnd );
    ss = mov(ss, &edx_opnd, &M_Base_Opnd(ebp_reg, +0x8) );
    ss = mov(ss, &ecx_opnd, &Imm_Opnd(0x20) );
	ss = alu(ss, sub_opc, &ecx_opnd, &ebx_opnd) ;
    ss = branch8(ss, cc_eq, &Imm_Opnd(0x5), 0); // je l_is_32
	ss = shift(ss, shld_opc, &eax_opnd, &edx_opnd) ;
	ss = shift(ss, shl_opc, &edx_opnd) ;
//l_is_32:
    ss = mov(ss, &ebx_opnd, &edx_opnd );
	ss = shift(ss, sar_opc, &ebx_opnd, &Imm_Opnd(0x1f)) ;
    ss = mov(ss, &esi_opnd, &M_Base_Opnd(edi_reg, +0x38) );
	ss = alu(ss, and_opc, &esi_opnd, &ebx_opnd) ;
	ss = alu(ss, add_opc, &esi_opnd, &edx_opnd) ;
    ss = mov(ss, &ecx_opnd, &eax_opnd );
	ss = alu(ss, cmp_opc, &eax_opnd, &ebx_opnd) ;
    ss = branch8(ss, cc_eq, &Imm_Opnd(0xb), 0); // je mul_zero
	ss = alu(ss, sub_opc, &eax_opnd, &ebx_opnd) ;
    ss = mov(ss, &ebx_opnd, &eax_opnd );
    ss = mov(ss, &edx_opnd, &M_Base_Opnd(edi_reg, +0x34) );
	ss = mul(ss, &edx_opnd, 0) ;
    ss = jump8(ss,&Imm_Opnd(0x4)); // jmp next1
//mul_zero:
	ss = alu(ss, xor_opc, &edx_opnd, &edx_opnd) ;
	ss = alu(ss, xor_opc, &eax_opnd, &eax_opnd) ;
//next1:
	ss = alu(ss, add_opc, &eax_opnd, &esi_opnd) ;
	ss = alu(ss, adc_opc, &edx_opnd, &Imm_Opnd(0)) ;
	ss = alu(ss, add_opc, &ecx_opnd, &edx_opnd) ;
    ss = mov(ss, &esi_opnd, &ecx_opnd );
    ss = mov(ss, &eax_opnd, &esi_opnd );
	ss = _not(ss, &eax_opnd) ;
    ss = mov(ss, &ebx_opnd, &M_Base_Opnd(edi_reg, +0x28) );
	ss = mul(ss, &ebx_opnd, 0) ;
    ss = mov(ss, &ecx_opnd, &M_Base_Opnd(ebp_reg, +0x0c) );
    ss = mov(ss, &edi_opnd, &M_Base_Opnd(ebp_reg, +0x8) );
	ss = alu(ss, sub_opc, &ecx_opnd, &ebx_opnd) ;
	ss = alu(ss, add_opc, &eax_opnd, &edi_opnd) ;
	ss = alu(ss, adc_opc, &edx_opnd, &ecx_opnd) ;
    ss = mov(ss, &ecx_opnd, &esi_opnd );
	ss = _not(ss, &ecx_opnd) ;
    ss = mov(ss, &eax_opnd, &edx_opnd );
	ss = alu(ss, sub_opc, &eax_opnd, &ecx_opnd) ;
	ss = alu(ss, xor_opc, &edx_opnd, &edx_opnd) ;
    ss = jump32(ss,&Imm_Opnd(0xbe)); // jmp end
	/*******************************************************************************
	 * Fast 64/64
	 *******************************************************************************/
//fast_64_64
	ss = alu(ss, or_opc, &esi_opnd, &esi_opnd) ;
    ss = branch8(ss, cc_ge, &Imm_Opnd(0x21), 1); // jge positive
    ss = mov(ss, &edx_opnd, &esi_opnd );
    ss = mov(ss, &ecx_opnd, &esi_opnd );
	ss = shift(ss, sar_opc, &ecx_opnd, &Imm_Opnd(0x1f)) ;
	ss = shift(ss, shr_opc, &esi_opnd, &Imm_Opnd(0x1f)) ;
    // [ebp-0x0c] = x_fan
    ss = mov(ss, &M_Base_Opnd(ebp_reg, -0x0c), &ecx_opnd );
    ss = mov(ss, &eax_opnd, &M_Base_Opnd(ebp_reg, +0x8) );
	ss = alu(ss, xor_opc, &eax_opnd, &ecx_opnd) ;
	ss = alu(ss, xor_opc, &edx_opnd, &ecx_opnd) ;
	ss = alu(ss, add_opc, &eax_opnd, &esi_opnd) ;
	ss = alu(ss, adc_opc, &edx_opnd, &Imm_Opnd(0) ) ;
    // [ebp-0x04] = tmp0, [ebp-0x08] = tmp1
    ss = mov(ss, &M_Base_Opnd(ebp_reg, -0x04), &edx_opnd );
    ss = mov(ss, &M_Base_Opnd(ebp_reg, -0x08), &eax_opnd );
    ss = jump8(ss,&Imm_Opnd(0x0e)); // jmp Cal_t1
//positive:
    ss = mov(ss, &M_Base_Opnd(ebp_reg, -0x04), &esi_opnd );
    ss = mov(ss, &eax_opnd, &M_Base_Opnd(ebp_reg, +0x8) );
    ss = mov(ss, &M_Base_Opnd(ebp_reg, -0x08), &eax_opnd );
	ss = alu(ss, xor_opc, &esi_opnd, &esi_opnd) ;
    ss = mov(ss, &M_Base_Opnd(ebp_reg, -0x0c), &esi_opnd );
//Cal_t1:
    ss = mov(ss, &ecx_opnd, &eax_opnd );
    ss = mov(ss, &ebx_opnd, &M_Base_Opnd(edi_reg, +0x0) );
	ss = mul(ss, &ebx_opnd, 0) ;
	ss = alu(ss, sub_opc, &eax_opnd, &Imm_Opnd(0x1) ) ;
	ss = alu(ss, sbb_opc, &edx_opnd, &Imm_Opnd(0x1) ) ;
	ss = alu(ss, add_opc, &eax_opnd, &esi_opnd ) ;
	ss = alu(ss, adc_opc, &edx_opnd, &Imm_Opnd(0) ) ;
    ss = mov(ss, &edi_opnd, &edx_opnd );
    ss = mov(ss, &eax_opnd, &M_Base_Opnd(ebp_reg, -0x4) );
	ss = mul(ss, &ebx_opnd, 0) ;
	ss = alu(ss, add_opc, &eax_opnd, &edi_opnd ) ;
	ss = alu(ss, adc_opc, &edx_opnd, &Imm_Opnd(0) ) ;
    ss = mov(ss, &edi_opnd, &edx_opnd );
    ss = mov(ss, &esi_opnd, &eax_opnd );
    ss = mov(ss, &eax_opnd, &ecx_opnd );
    // edi was overwritten above, so the table address is reloaded from the
    // argument slot before each further table access.
    ss = mov(ss, &ebx_opnd, &M_Base_Opnd(ebp_reg, +0x10) );
    ss = mov(ss, &ebx_opnd, &M_Base_Opnd(ebx_reg, +0x04) );
	ss = mul(ss, &ebx_opnd, 0) ;
	ss = alu(ss, add_opc, &eax_opnd, &esi_opnd ) ;
	ss = alu(ss, adc_opc, &edx_opnd, &edi_opnd ) ;
    ss = mov(ss, &edi_opnd, &Imm_Opnd(0) );
	ss = alu(ss, adc_opc, &edi_opnd, &Imm_Opnd(0) ) ;
    ss = mov(ss, &esi_opnd, &edx_opnd );
    ss = mov(ss, &eax_opnd, &M_Base_Opnd(ebp_reg, -0x4) );
	ss = mul(ss, &ebx_opnd, 0) ;
	ss = alu(ss, add_opc, &eax_opnd, &esi_opnd ) ;
	ss = alu(ss, adc_opc, &edx_opnd, &edi_opnd ) ;
    ss = mov(ss, &edi_opnd, &M_Base_Opnd(ebp_reg, -0x0c) );
    ss = mov(ss, &ebx_opnd, &edi_opnd );
	ss = _not(ss, &edi_opnd) ;
	ss = alu(ss, xor_opc, &eax_opnd, &edi_opnd) ;
	ss = alu(ss, xor_opc, &edx_opnd, &edi_opnd) ;
    ss = mov(ss, &edi_opnd, &M_Base_Opnd(ebp_reg, +0x08) );
    ss = mov(ss, &esi_opnd, &M_Base_Opnd(ebp_reg, +0x0c) );
	ss = alu(ss, add_opc, &eax_opnd, &edi_opnd ) ;
	ss = alu(ss, adc_opc, &edx_opnd, &esi_opnd ) ;
    ss = mov(ss, &ecx_opnd, &M_Base_Opnd(ebp_reg, +0x10) );
    ss = mov(ss, &ecx_opnd, &M_Base_Opnd(ecx_reg, +0x10) );
	ss = alu(ss, cmp_opc, &ecx_opnd, &Imm_Opnd(0x20) ) ;
    ss = branch8(ss, cc_ge, &Imm_Opnd(0x07), 1); // jge biger_than_32
	ss = shift(ss, shrd_opc, &eax_opnd, &edx_opnd ) ;
	ss = shift(ss, sar_opc, &edx_opnd ) ;
    ss = jump8(ss,&Imm_Opnd(0x0a)); // jmp next
//biger_than_32:
	ss = alu(ss, sub_opc, &ecx_opnd, &Imm_Opnd(0x20) ) ;
    ss = mov(ss, &eax_opnd, &edx_opnd );
	ss = shift(ss, sar_opc, &edx_opnd, &Imm_Opnd(0x1f) ) ;
	ss = shift(ss, sar_opc, &eax_opnd ) ;
//next:
	ss = alu(ss, sub_opc, &eax_opnd, &ebx_opnd ) ;
	ss = alu(ss, sbb_opc, &edx_opnd, &ebx_opnd ) ;
    ss = mov(ss, &ecx_opnd, &M_Base_Opnd(ebp_reg, +0x10) );
    ss = mov(ss, &ecx_opnd, &M_Base_Opnd(ecx_reg, +0x14) );
	ss = test(ss, &ecx_opnd, &ecx_opnd) ;
    ss = branch8(ss, cc_eq, &Imm_Opnd(0x08), 0); // je end
	ss = alu(ss, xor_opc, &eax_opnd, &ecx_opnd) ;
	ss = alu(ss, xor_opc, &edx_opnd, &ecx_opnd) ;
	ss = alu(ss, sub_opc, &eax_opnd, &ecx_opnd) ;
	ss = alu(ss, sbb_opc, &edx_opnd, &ecx_opnd) ;
	
//end:
    // Epilogue: restore registers, release locals and frame, pop 12 bytes
    // of arguments.
    ss = pop(ss, &edi_opnd);
    ss = pop(ss, &esi_opnd);
    ss = pop(ss, &ebx_opnd);
    ss = mov(ss, &esp_opnd, &ebp_opnd);
    ss = pop(ss, &ebp_opnd);

    ss = ret(ss, &Imm_Opnd(0x0c));

// Disabled alternative: a stub that merely jumps to the C function
// orp_const_ldiv().
#if 0
/******************************************************************************
 * fast long division in function orp_const_ldiv
 ******************************************************************************/
    const int stub_size = 25;
    char *stub = (char *)gc_malloc_fixed_code_for_class_loading(stub_size);
#ifdef _DEBUG
    memset(stub, 0x90, stub_size);   // nop
#endif
    char *ss = stub;
    ss = jump32(ss, &Imm_Opnd((((uint32)orp_const_ldiv) - ((uint32)ss)) - 5));

#endif

    assert((ss - stub) <= stub_size);
    addr = stub;
#ifdef ORP_VTUNE_SUPPORT
    //M: 
    vtune_notify_stub_load_finished("getaddress__orp_const_ldiv_naked",(Byte*) stub,ss-stub);
#endif
    return addr;
} //getaddress__orp_const_ldiv_naked

// Double-precision division helper: returns the quotient a / b.
// The quotient is stored in a double-typed local before being returned.
static double orp_rt_ddiv(double a, double b)
{
    const double quotient = a / b;

    return quotient;
} //orp_rt_ddiv


// Builds (once) and returns the address of a small stub that loads a
// floating-point argument from [ebp+0x10], stores it as an integer with a
// temporarily modified x87 control word, and returns that integer in eax.
// The two stack slots at [ebp+8] and [ebp+0x0c] are used as scratch space
// (orp_d2i passes two dummy int arguments in front of the double).
void *getaddress__orp_d2i()
{
    static void *addr = 0;
    if (!addr) {
        const int stub_size = 43;
        char *const stub_base = (char *)gc_malloc_fixed_code_for_class_loading(stub_size);
#ifdef _DEBUG
        memset(stub_base, 0xcc /*int 3*/, stub_size);
#endif
        char *code = stub_base;

        // Prologue.
        code = push(code, &ebp_opnd);
        code = mov(code, &ebp_opnd, &esp_opnd);
        code = push(code, &ebx_opnd);
        code = push(code, &esi_opnd);
        code = push(code, &edi_opnd);

        // Load the operand, then save the current control word in [ebp+8]
        // and build a modified copy (old | 0xc7f) in [ebp+0x0c].
        code = fld(code, &M_Base_Opnd(ebp_reg, +0x10), 1);
        code = wait(code);
        code = fnstcw(code, &M_Base_Opnd(ebp_reg, +8) );
        code = mov(code, &ax_opnd, &M_Base_Opnd(ebp_reg, +8), opnd_16);
        code = alu(code, or_opc, &ax_opnd, &Imm_Opnd(0xc7f));
        code = mov(code, &M_Base_Opnd(ebp_reg, +0x0c), &ax_opnd, opnd_16);

        // Convert under the modified control word, then restore the saved one.
        code = fldcw(code, &M_Base_Opnd(ebp_reg, +0x0c) );
        code = fist_pop(code, &M_Base_Opnd(ebp_reg, +0x0c), 0);
        code = fldcw(code, &M_Base_Opnd(ebp_reg, +8) );

        // Result.
        code = mov(code, &eax_opnd, &M_Base_Opnd(ebp_reg, +0x0c) );

        // Epilogue.
        code = pop(code, &edi_opnd);
        code = pop(code, &esi_opnd);
        code = pop(code, &ebx_opnd);
        code = pop(code, &ebp_opnd);

        code = ret(code);

        addr = stub_base;
        assert((code - stub_base) < stub_size);
#ifdef ORP_VTUNE_SUPPORT
        //M:
        vtune_notify_stub_load_finished("getaddress__orp_d2i",(Byte*) stub_base,code-stub_base);
#endif
    }
    return addr;
} //getaddress__orp_d2i


#ifdef  ORP_POSIX

//extern int  _isnan(double dd);
#ifndef _isnan
#define _isnan isnan
#endif

#endif

static int32 __stdcall orp_d2i(double d) stdcall__;

// Converts a double to a 32-bit integer with saturation.
//
//   d - value to convert
//
// Returns
//   - the value produced by the conversion stub from getaddress__orp_d2i()
//     whenever that value is not 0x80000000;
//   - 0 when d is NaN;
//   - 2147483647 when d is at or above 2^31;
//   - -2147483648 otherwise (d at or below -2^31, which includes the one
//     input, exactly -2^31, for which 0x80000000 is the genuine result).
//
// The range tests use floating-point literals on purpose: an expression
// such as "-2147483648" is typed differently by different compilers (it can
// be the negation of an unsigned constant), which changes the outcome of
// the comparison.
static int32 __stdcall orp_d2i(double d)
{
    assert( !orp_is_gc_enabled(p_TLS_orpthread) );

#ifdef ORP_STATS
    orp_stats_total.num_d2i++;
#endif

    int32 (*gad2i)(int, int, double);
    gad2i = (int32 ( *)(int, int, double) )getaddress__orp_d2i();

    // The two leading ints are dummy arguments; the stub uses their stack
    // slots as scratch space.
    int32 result = gad2i(0, 0, d);

    // 0x80000000 is the integer indefinite value; anything else is a
    // regular conversion result.
    if(0x80000000 != (uint32)result) {
        return result;
    }

    if(_isnan(d)) {
        return 0;
    }
    if(d >= 2147483648.0) {
        return 2147483647;              // maxint
    }
    // Every remaining input is <= -2^31.
    return (int32)(-2147483647 - 1);    // minint
} //orp_d2i


// Builds (once) and returns the address of a small stub that loads a
// floating-point argument from [ebp+0x14], stores it as an integer at
// [ebp+0x0c] with a temporarily modified x87 control word, and returns the
// two dwords at [ebp+0x0c]/[ebp+0x10] in eax/edx.
// The stack slots at [ebp+8] .. [ebp+0x10] are used as scratch space
// (orp_d2l passes three dummy int arguments in front of the double).
void *getaddress__orp_d2l()
{
    static void *addr = 0;
    if (!addr) {
        const int stub_size = 100;
        char *const stub_base = (char *)gc_malloc_fixed_code_for_class_loading(stub_size);
#ifdef _DEBUG
        memset(stub_base, 0xcc /*int 3*/, stub_size);
#endif
        char *code = stub_base;

        // Prologue.
        code = push(code, &ebp_opnd);
        code = mov(code, &ebp_opnd, &esp_opnd);
        code = push(code, &ebx_opnd);
        code = push(code, &esi_opnd);
        code = push(code, &edi_opnd);

        // Load the operand, then save the current control word in [ebp+8]
        // and build a modified copy (old | 0xc7f) in [ebp+0x0c].
        code = fld(code, &M_Base_Opnd(ebp_reg, +0x14), 1);
        code = wait(code);
        code = fnstcw(code, &M_Base_Opnd(ebp_reg, +8) );
        code = mov(code, &ax_opnd, &M_Base_Opnd(ebp_reg, +8), opnd_16);
        code = alu(code, or_opc, &ax_opnd, &Imm_Opnd(0xc7f));
        code = mov(code, &M_Base_Opnd(ebp_reg, +0x0c), &ax_opnd, opnd_16);

        // Convert under the modified control word, then restore the saved one.
        code = fldcw(code, &M_Base_Opnd(ebp_reg, +0x0c) );
        code = fist_pop(code, &M_Base_Opnd(ebp_reg, +0x0c), 1);
        code = fldcw(code, &M_Base_Opnd(ebp_reg, +8) );

        // Result: low dword in eax, high dword in edx.
        code = mov(code, &eax_opnd, &M_Base_Opnd(ebp_reg, +0x0c) );
        code = mov(code, &edx_opnd, &M_Base_Opnd(ebp_reg, +0x10) );

        // Epilogue.
        code = pop(code, &edi_opnd);
        code = pop(code, &esi_opnd);
        code = pop(code, &ebx_opnd);
        code = pop(code, &ebp_opnd);

        code = ret(code);

        addr = stub_base;
        assert((code - stub_base) < stub_size);
#ifdef ORP_VTUNE_SUPPORT
        //M:
        vtune_notify_stub_load_finished("getaddress__orp_d2l",(Byte*) stub_base,code-stub_base);
#endif
    }
    return addr;
} //getaddress__orp_d2l


static int64 __stdcall orp_d2l(double d) stdcall__;

// Converts a double to a 64-bit integer with saturation.
//
//   d - value to convert
//
// Returns
//   - the value produced by the conversion stub from getaddress__orp_d2l()
//     whenever that value is not 0x8000000000000000;
//   - 0 when d is NaN;
//   - 0x7fffffffffffffff when d is at or above 2^63;
//   - 0x8000000000000000 (the most negative value) otherwise, i.e. for d at
//     or below -2^63, which includes the one input, exactly -2^63, for
//     which that bit pattern is the genuine result.
//
// The whole 64-bit result is compared against the indefinite pattern.
// Looking at the high dword alone would also match every valid result in
// the range [-2^63, -2^63 + 2^32), several of which are exactly
// representable doubles.
static int64 __stdcall orp_d2l(double d)
{
    assert(  !orp_is_gc_enabled(p_TLS_orpthread) );

#ifdef ORP_STATS
    orp_stats_total.num_d2l++;
#endif

    // Built without negating an out-of-range literal, so the constants have
    // the same value regardless of how the compiler types such a literal.
    const int64 max_long = __INT64_C(0x7fffffffffffffff);
    const int64 min_long = -max_long - 1;

    int64 (*gad2l)(int, int, int, double);
    gad2l = (int64 ( *)(int, int, int, double) )getaddress__orp_d2l();

    // The three leading ints are dummy arguments; the stub uses their stack
    // slots as scratch space.
    int64 result = gad2l(0, 0, 0, d);

    // 0x8000000000000000 is the 64-bit integer indefinite value; anything
    // else is a regular conversion result.
    if(result != min_long) {
        return result;
    }

    if(_isnan(d)) {
        return 0;
    }
    if(d >= 9223372036854775808.0) {    // 2^63
        return max_long;                // maxint
    }
    // Every remaining input is <= -2^63.
    return min_long;                    // minint
} //orp_d2l


//
// Returns the address of the machine-code stub that performs the raw x87
// float -> 32-bit integer conversion.  The stub is generated on the first
// call and cached in a function-local static.
//
// orp_f2i() invokes the stub as  int32 stub(int, int, float).  Once the
// prologue has run, the two int arguments sit at [ebp+8] and [ebp+0x0c] and
// the float at [ebp+0x10]; the int slots are used only as scratch memory.
// The result is left in eax and the stub ends with a plain ret.
//
void *getaddress__orp_f2i()
{
    static void *addr = 0;
    if (addr) {
        return addr;
    }

    const int stub_size = 43;
    char *code = (char *)gc_malloc_fixed_code_for_class_loading(stub_size);
#ifdef _DEBUG
    memset(code, 0xcc /*int 3*/, stub_size);
#endif
    char *ip = code;

    // Prologue: establish a frame and save ebx, esi and edi.
    ip = push(ip, &ebp_opnd);
    ip = mov(ip, &ebp_opnd, &esp_opnd);
    ip = push(ip, &ebx_opnd);
    ip = push(ip, &esi_opnd);
    ip = push(ip, &edi_opnd);

    // Load the float argument onto the FPU stack.
    ip = fld(ip, &M_Base_Opnd(ebp_reg, +0x10), 0);
    ip = wait(ip);

    // Save the current FPU control word at [ebp+8], then build a copy with
    // the bits 0xc7f set at [ebp+0x0c].  In the x87 control word 0xc00 is the
    // rounding-control field (both bits set selects truncation) and the low
    // bits are the exception masks.
    ip = fnstcw(ip, &M_Base_Opnd(ebp_reg, +8) );
    ip = mov(ip, &ax_opnd, &M_Base_Opnd(ebp_reg, +8), opnd_16);
    ip = alu(ip, or_opc, &ax_opnd, &Imm_Opnd(0xc7f));
    ip = mov(ip, &M_Base_Opnd(ebp_reg, +0x0c), &ax_opnd, opnd_16);

    // Convert under the modified control word, storing the integer over the
    // scratch slot at [ebp+0x0c], then put the saved control word back.
    ip = fldcw(ip, &M_Base_Opnd(ebp_reg, +0x0c) );
    ip = fist_pop(ip, &M_Base_Opnd(ebp_reg, +0x0c), 0);
    ip = fldcw(ip, &M_Base_Opnd(ebp_reg, +8) );

    // Return value in eax.
    ip = mov(ip, &eax_opnd, &M_Base_Opnd(ebp_reg, +0x0c) );

    // Epilogue: restore the saved registers in reverse order and return.
    ip = pop(ip, &edi_opnd);
    ip = pop(ip, &esi_opnd);
    ip = pop(ip, &ebx_opnd);
    ip = pop(ip, &ebp_opnd);
    ip = ret(ip);

    addr = code;
    // The emitted code must fit inside the allocated buffer.
    assert((ip - code) < stub_size);
#ifdef ORP_VTUNE_SUPPORT
    //M: 
    vtune_notify_stub_load_finished("getaddress__orp_f2i",(Byte*) code,ip-code);
#endif
    return addr;
} //getaddress__orp_f2i

static int32 __stdcall orp_f2i(float f) stdcall__;

//
// Runtime helper for the Java f2i conversion.
//
// The raw conversion is done by the stub from getaddress__orp_f2i(); the
// two zero arguments only provide the scratch stack slots that stub
// overwrites.  When the value cannot be represented the FPU stores the
// "integer indefinite" pattern 0x80000000, and this wrapper then produces
// the result Java requires:
//     NaN        -> 0
//     f >  maxint -> 2147483647
//     f <= minint -> -2147483648
//
// The bounds are spelled as typed constants.  The bare literal 2147483648
// is unsigned under some compilers' rules and a signed 64-bit value under
// others, so negating it does not reliably give -2^31.
//
static int32 __stdcall orp_f2i(float f)
{
    assert(  !orp_is_gc_enabled(p_TLS_orpthread) );

#ifdef ORP_STATS
    orp_stats_total.num_f2i++;
#endif

    const int32 max_int = 2147483647;
    const int32 min_int = -max_int - 1;     // bit pattern 0x80000000

    int32 (*gaf2i)(int, int, float);
    gaf2i = (int32 ( *)(int, int, float) )getaddress__orp_f2i();

    const int32 result = gaf2i(0, 0, f);

    // Anything other than the indefinite pattern is a valid conversion.
    if (result != min_int) {
        return result;
    }

    if (_isnan(f)) {
        return 0;
    }

    if (f > (double)max_int) {
        return max_int;       // maxint
    }

    // Only f <= -2^31 is left.  f == -2^31 converts exactly to min_int, so
    // it must be accepted here rather than treated as impossible.
    assert(f <= (double)min_int);
    return min_int;           // minint

} //orp_f2i


//
// Returns the address of the machine-code stub that performs the raw x87
// float -> 64-bit integer conversion.  The stub is generated on the first
// call and cached in a function-local static.
//
// orp_f2l() invokes the stub as  int64 stub(int, int, int, float).  Once the
// prologue has run, the three int arguments sit at [ebp+8], [ebp+0x0c] and
// [ebp+0x10] and the float at [ebp+0x14]; the int slots are used only as
// scratch memory.  The result is left in edx:eax and the stub ends with a
// plain ret.
//
void *getaddress__orp_f2l()
{
    static void *addr = 0;
    if (addr) {
        return addr;
    }

    const int stub_size = 100;
    char *code = (char *)gc_malloc_fixed_code_for_class_loading(stub_size);
#ifdef _DEBUG
    memset(code, 0xcc /*int 3*/, stub_size);
#endif
    char *ip = code;

    // Prologue: establish a frame and save ebx, esi and edi.
    ip = push(ip, &ebp_opnd);
    ip = mov(ip, &ebp_opnd, &esp_opnd);
    ip = push(ip, &ebx_opnd);
    ip = push(ip, &esi_opnd);
    ip = push(ip, &edi_opnd);

    // Load the float argument onto the FPU stack.
    ip = fld(ip, &M_Base_Opnd(ebp_reg, +0x14), 0);
    ip = wait(ip);

    // Save the current FPU control word at [ebp+8], then build a copy with
    // the bits 0xc7f set at [ebp+0x0c].  In the x87 control word 0xc00 is the
    // rounding-control field (both bits set selects truncation) and the low
    // bits are the exception masks.
    ip = fnstcw(ip, &M_Base_Opnd(ebp_reg, +8) );
    ip = mov(ip, &ax_opnd, &M_Base_Opnd(ebp_reg, +8), opnd_16);
    ip = alu(ip, or_opc, &ax_opnd, &Imm_Opnd(0xc7f));
    ip = mov(ip, &M_Base_Opnd(ebp_reg, +0x0c), &ax_opnd, opnd_16);

    // Convert under the modified control word, storing the integer over the
    // scratch slots starting at [ebp+0x0c], then put the saved control word
    // back.
    ip = fldcw(ip, &M_Base_Opnd(ebp_reg, +0x0c) );
    ip = fist_pop(ip, &M_Base_Opnd(ebp_reg, +0x0c), 1);
    ip = fldcw(ip, &M_Base_Opnd(ebp_reg, +8) );

    // Return value: low dword in eax, high dword in edx.
    ip = mov(ip, &eax_opnd, &M_Base_Opnd(ebp_reg, +0x0c) );
    ip = mov(ip, &edx_opnd, &M_Base_Opnd(ebp_reg, +0x10) );

    // Epilogue: restore the saved registers in reverse order and return.
    ip = pop(ip, &edi_opnd);
    ip = pop(ip, &esi_opnd);
    ip = pop(ip, &ebx_opnd);
    ip = pop(ip, &ebp_opnd);
    ip = ret(ip);

    addr = code;
    // The emitted code must fit inside the allocated buffer.
    assert((ip - code) < stub_size);
#ifdef ORP_VTUNE_SUPPORT
    //M: WangYong
    vtune_notify_stub_load_finished("getaddress__orp_f2l",(Byte*) code,ip-code);
#endif
    return addr;
} //getaddress__orp_f2l


static int64 __stdcall orp_f2l(float f) stdcall__;

//
// Runtime helper for the Java f2l conversion.
//
// The raw conversion is done by the stub from getaddress__orp_f2l(); the
// three zero arguments only provide the scratch stack slots that stub
// overwrites.  When the value cannot be represented the FPU stores the
// 64-bit "integer indefinite" pattern 0x8000000000000000, and this wrapper
// then produces the result Java requires:
//     NaN        -> 0
//     f >= 2^63  -> 0x7fffffffffffffff
//     f <= -2^63 -> 0x8000000000000000
//
// The bounds are typed constants: the hex literal 0x8000000000000000 does
// not fit a signed 64-bit type, so it is unsigned and negating it yields
// +2^63 rather than -2^63.  The result is compared as a whole 64-bit value
// instead of reading its high dword through a uint32 pointer.
//
static int64 __stdcall orp_f2l(float f)
{
    assert(  !orp_is_gc_enabled(p_TLS_orpthread) );

#ifdef ORP_STATS
    orp_stats_total.num_f2l++;
#endif

    const int64 max_long = __INT64_C(0x7fffffffffffffff);
    // Derived from max_long so that no unsigned literal has to be negated.
    const int64 min_long = -max_long - 1;

    int64 (*gaf2l)(int, int, int, float);
    gaf2l = (int64 ( *)(int, int, int, float) )getaddress__orp_f2l();

    const int64 result = gaf2l(0, 0, 0, f);

    // Anything other than the indefinite pattern is a valid conversion.
    if (result != min_long) {
        return result;
    }

    if (_isnan(f)) {
        return 0;
    }

    // (double)max_long rounds up to exactly 2^63.
    if (f >= (double)max_long) {
        return max_long;      // maxint
    }

    // Only f <= -2^63 is left; for f == -2^63 the stub's answer was already
    // the correct one.
    assert(f <= (double)min_long);
    return min_long;          // minint

} //orp_f2l


//
// If fprem succeeds in producing a remainder that is less than the
// modulus, the function is complete and the C2 flag is cleared.
// Otherwise, C2 is set, and the result on the top of the fp stack
// is the partial remainder.  We need to re-execute the fprem instruction
// (using the partial remainder) until C2 is cleared.
//


//
// Returns the address of the machine-code stub that computes a float
// remainder with the x87 fprem instruction.  The stub is generated on the
// first call and cached in a function-local static.
//
// The stub reads its two float operands from [esp+4] and [esp+8], leaves the
// result on the FPU stack and returns with "ret 8", removing both operands
// from the stack itself.
//
void *getaddress__orp_frem()
{
    static void *addr = 0;
    if (addr) {
        return addr;
    }

    const int stub_size = 24;
    char *code = (char *)gc_malloc_fixed_code_for_class_loading(stub_size);
#ifdef _DEBUG
    memset(code, 0xcc /*int 3*/, stub_size);
#endif
    char *ip = code;

    // Push both operands onto the FPU stack; the one from [esp+8] ends up
    // on top.
    ip = fld(ip, &M_Base_Opnd(esp_reg, 4), 0);
    ip = fld(ip, &M_Base_Opnd(esp_reg, 8), 0);

    // Loop head: run fprem, fetch the FPU status word and keep only bit
    // 0x400 (the C2 flag), which stays set while the reduction is partial.
    char *const loop_head = ip;
    ip = math_fprem(ip);
    ip = fnstsw(ip); 
    ip = alu(ip, and_opc, &ax_opnd, &Imm_Opnd(0x400));

    // Backward displacement to the loop head; the extra -2 presumably
    // accounts for the length of the short branch instruction itself.
    const int back_disp = (int)loop_head - (int)ip - 2;
    ip = branch8(ip, cc_ne, &Imm_Opnd(back_disp), 0); // jne loop_head

    // Drop the second FPU stack entry (presumably "fstp st(1)") so that only
    // the remainder is left on the FPU stack.
    ip = fstp(ip, 1);

    ip = ret(ip,&Imm_Opnd(8));

    addr = code;
    // The emitted code must fit inside the allocated buffer.
    assert((ip - code) < stub_size);
#ifdef ORP_VTUNE_SUPPORT
    //M:
    vtune_notify_stub_load_finished("getaddress__orp_frem",(Byte*) code,ip-code);
#endif
    return addr;
} //getaddress__orp_frem



void *getaddress__orp_drem()
{
    static void *addr = 0;
    if (addr) {
        return addr;
    }

    const int stub_size = 22;
    char *stub = (char *)gc_malloc_fixed_code_for_class_loading(stub_size);
#ifdef _DEBUG
    memset(stub, 0xcc /*int 3*/, stub_size);
#endif
    char *ss = stub;    

    ss = fld(ss, &M_Base_Opnd(esp_reg, 4), 1);
    ss = fld(ss, &M_Base_Opnd(esp_reg, 12), 1);

//rem_not_complete:
    int rem_not_complete = (int)ss;

    ss = math_fprem(ss);
    ss = fnstsw(ss); 
    ss = alu(ss, and_opc, &ax_opnd, &Imm_Opnd(0x400));

    int offset = rem_not_complete - (int)ss - 2;
    ss = branch8(ss, cc_ne, &Imm_Opnd(offset), 0); // jne rem_not_complete

    ss = ret(ss,&Imm_Opnd(0x10));

    addr = stub;
    assert((ss - stub) < stub_size);
#ifdef ORP_VTUNE_SUPPORT
    //M: 
    vtune_notify_stub_load_finished("getaddress__orp_drem",(Byte*) stub,ss-stub);
#endif
    return addr;    
} //getaddress__orp_drem




/////////////////////////////////////////////////////////////////
// end ORP_Runtime_Support
/////////////////////////////////////////////////////////////////


//
// Throws the Java error that corresponds to a class-loading failure code.
//
//   cp_index  constant-pool index involved; only printed.
//   clss      not used by this function.
//   ld_exc    failure code; only LD_NoClassDefFoundError is expected, any
//             other value trips an assertion and is reported as
//             java/lang/VirtualMachineError.
//
// throw_java_exception() is not expected to return; the trailing assert(0)
// guards that.
//
static void
orp_throw_linking_exception(unsigned cp_index,
                            Class *clss,
                            Loader_Exception ld_exc)
{
    // cp_index is unsigned, so it is printed with %u.
    printf("orp_throw_linking_exception, idx=%u\n", cp_index);
    const char *error_class_name;
    switch(ld_exc) {
    case LD_NoClassDefFoundError:
        error_class_name = "java/lang/NoClassDefFoundError";
        break;
    default:
        error_class_name = "java/lang/VirtualMachineError";
        assert(0);
        break;
    }
    throw_java_exception(error_class_name);
    assert(0);
} //orp_throw_linking_exception


//
// Returns the address of the machine-code stub through which compiled code
// reaches orp_throw_linking_exception().  The stub is generated on the first
// call and cached in a function-local static.
//
// The stub sets up a Java-to-native frame, copies three stack dwords as the
// callee's arguments and calls orp_throw_linking_exception().  Nothing is
// emitted after the call: that function ends by throwing.
//
void * getaddress__orp_throw_linking_exception_naked()
{
    static void *addr = 0;
    if (addr) {
        return addr;
    }

    const int stub_size = 100;
    char *code = (char *)gc_malloc_fixed_code_for_class_loading(stub_size);
#ifdef _DEBUG
    memset(code, 0xcc /*int 3*/, stub_size);
#endif
    char *ip = gen_setup_j2n_frame(code);

    // Every push lowers esp by 4, so pushing from the same esp-relative
    // displacement three times copies three consecutive dwords, highest
    // address first, and keeps their relative order.
    for (int copied = 0; copied < 3; copied++) {
        ip = push(ip, &M_Base_Opnd(esp_reg, (sizeof(J2N_Saved_State)+8) ) );
    }
    ip = call(ip, (char *)orp_throw_linking_exception);
    
    addr = code;
    // The emitted code must fit inside the allocated buffer.
    assert((ip - code) < stub_size);
#ifdef ORP_VTUNE_SUPPORT
    //M: 
    vtune_notify_stub_load_finished("getaddress__orp_throw_linking_exception_naked",(Byte*) code,ip-code);
#endif
    return addr;
} //getaddress__orp_throw_linking_exception_naked



//
// Returns the address of the machine-code stub that reports a JVMDI
// method-entry event.  The stub is generated on the first call and cached in
// a function-local static.
//
// The stub sets up a Java-to-native frame, calls jvmdi_method_entry(), pops
// the frame again and returns.
//
void * getaddress__jvmdi_method_entry_event()
{
    static void *addr = 0;
    if (addr) {
        return addr;
    }

    const int stub_size = 1024;
    char *stub = (char *)gc_malloc_fixed_code_for_class_loading(stub_size);
#ifdef _DEBUG
    memset(stub, 0xcc /*int 3*/, stub_size);
#endif
    char *ss = stub;

    ss = gen_setup_j2n_frame(ss);
    ss = call(ss, (char *)jvmdi_method_entry);
    ss = gen_pop_j2n_frame(ss);
    ss = ret(ss);
    
    addr = stub;
    // Same guard as the other stub generators in this file: the emitted code
    // must fit inside the allocated buffer.
    assert((ss - stub) < stub_size);
#ifdef ORP_VTUNE_SUPPORT
    //M: 
    vtune_notify_stub_load_finished("getaddress__jvmdi_method_entry_event",(Byte*) stub,ss-stub);
#endif
    return addr;
} //getaddress__jvmdi_method_entry_event


//
// Returns the address of the machine-code stub that reports a JVMDI
// method-exit event.  The stub is generated on the first call and cached in
// a function-local static.
//
// The stub sets up a Java-to-native frame, calls jvmdi_method_exit(), pops
// the frame again and returns.
//
void * getaddress__jvmdi_method_exit_event()
{
    static void *addr = 0;
    if (addr) {
        return addr;
    }

    const int stub_size = 20;
    char *code = (char *)gc_malloc_fixed_code_for_class_loading(stub_size);
#ifdef _DEBUG
    memset(code, 0xcc /*int 3*/, stub_size);
#endif

    char *ip = gen_setup_j2n_frame(code);
    ip = call(ip, (char *)jvmdi_method_exit);
    ip = gen_pop_j2n_frame(ip);
    ip = ret(ip);
    
    addr = code;
    // The emitted code must fit inside the allocated buffer.
    assert((ip - code) < stub_size);
#ifdef ORP_VTUNE_SUPPORT
    //M: 
    vtune_notify_stub_load_finished("getaddress__jvmdi_method_exit_event",(Byte*) code,ip-code);
#endif
    return addr;
} //getaddress__jvmdi_method_exit_event

//
// Returns the address of a small machine-code stub that forwards the value
// in ecx to gc_write_barrier() as a stack argument.  The stub is generated
// on the first call and cached in a function-local static.
//
// Emitted code:  push ecx;  call gc_write_barrier;  add esp, 4;  ret
//
void * getaddress__gc_write_barrier_fastcall()
{
    static void *addr = 0;
    if (addr) {
        return addr;
    }

    const int stub_size = 11;
    char *code = (char *)gc_malloc_fixed_code_for_class_loading(stub_size);
#ifdef _DEBUG
    memset(code, 0xcc /*int 3*/, stub_size);
#endif

    // Pass ecx on the stack, call the barrier, then discard the pushed
    // argument again before returning.
    char *ip = push(code, &ecx_opnd);
    ip = call(ip, (char *)gc_write_barrier);
    ip = alu(ip, add_opc, &esp_opnd, &Imm_Opnd(4));
    ip = ret(ip);

    addr = code;
    // The emitted code must fit inside the allocated buffer.
    assert((ip - code) < stub_size);
#ifdef ORP_VTUNE_SUPPORT
    //M: 
    vtune_notify_stub_load_finished("getaddress__gc_write_barrier_fastcall",(Byte*) code,ip-code);
#endif
    return addr;
} //getaddress__gc_write_barrier_fastcall



//
// Maps a runtime-support function id to the address of the code that
// implements it.
//
// Some ids resolve to ordinary functions, whose address is returned
// directly; the others resolve to stubs obtained from the matching
// getaddress__*() routine.  An id that matches no case trips assert(0) and
// yields 0.
//
void *orp_get_rt_support_addr(ORP_RT_SUPPORT f)
{
    switch(f) {
    // Exception throwing.
    case ORP_RT_NULL_PTR_EXCEPTION:
        return getaddress__orp_exception_nullpointer_naked();
    case ORP_RT_IDX_OUT_OF_BOUNDS:
        return getaddress__orp_exception_arrayindexoutofbounds_naked();
    case ORP_RT_ARRAY_STORE_EXCEPTION:
        return getaddress__orp_exception_arraystore_naked();
    case ORP_RT_ATHROW:
        return getaddress__orp_athrow_naked();
    case ORP_RT_ATHROW_LAZY:
        return getaddress__orp_athrow_lazy_naked();
    // String constants and allocation.
    case ORP_RT_LDC_STRING:
        return getaddress__orp_instantiate_cp_string_naked();
    case ORP_RT_NEW_RESOLVED:
        return getaddress__orp_alloc_java_object_resolved_naked();
    case ORP_RT_NEW_WITH_FINALIZER_RESOLVED:
        return getaddress__orp_alloc_java_object_with_finalizer_resolved_naked();
    case ORP_RT_ANEWARRAY_RESOLVED:
        return getaddress__orp_anewarray_resolved_naked();
    case ORP_RT_NEWARRAY:
        return getaddress__orp_newarray_naked();
    case ORP_RT_MULTIANEWARRAY_RESOLVED:
        return getaddress__orp_multianewarray_resolved_naked();
    case ORP_RT_NEW_VECTOR:
        return getaddress__orp_new_vector_naked();
    case ORP_RT_AASTORE:
        return (void *)orp_aastore;
    // Floating-point to integer conversions: the addresses handed out are
    // the orp_* wrapper functions, not the getaddress__orp_* stubs.
    case ORP_RT_F2I:
        return (void *)orp_f2i;  // need to get Ken's wait opcode fixes in x86.h
    case ORP_RT_F2L:
        return (void *)orp_f2l;
    case ORP_RT_D2I:
        return (void *)orp_d2i; // need to get Ken's wait fix
    case ORP_RT_D2L:
        return (void *)orp_d2l; 
    // Long shifts, remainders, multiplication and division.
    case ORP_RT_LSHL:
        return getaddress__orp_lshl_naked();
    case ORP_RT_LSHR:
        return getaddress__orp_lshr_naked();
    case ORP_RT_LUSHR:
        return getaddress__orp_lushr_naked();
    case ORP_RT_FREM:
        return getaddress__orp_frem();
    case ORP_RT_DREM:
        return getaddress__orp_drem();
    case ORP_RT_LMUL:
        return (void *)orp_lmul;
#ifdef ORP_LONG_OPT
	case ORP_RT_LMUL_CONST_MULTIPLIER:
		return (void *)orp_lmul_const_multiplier;
#endif
    case ORP_RT_LREM:
        return getaddress__orp_lrem_naked();
    case ORP_RT_LDIV:
        return getaddress__orp_ldiv_naked();
	case ORP_RT_CONST_LDIV:
		return getaddress__orp_const_ldiv_naked() ;
	case ORP_RT_CONST_LREM:
		return getaddress__orp_const_lrem_naked() ;
    case ORP_RT_DDIV:
        return (void *)orp_rt_ddiv;
    // GC write barriers.
    case ORP_RT_WRITE_BARRIER_ATOMIC:
        return (void *)gc_write_barrier_atomic;
    case ORP_RT_WRITE_BARRIER:
        return (void *)gc_write_barrier;
    case ORP_RT_WRITE_BARRIER_FASTCALL:
        return getaddress__gc_write_barrier_fastcall();

#ifdef STAT_INDIRECT_CALL
	case ORP_RT_STAT_INDIRECT_CALL:
		//::for debug
		//void count_stat_indirect_call(unsigned mh) ;
		//return count_stat_indirect_call ;
		//::
		void *getaddress__orp_stat_indirect_call_naked() ;
		return getaddress__orp_stat_indirect_call_naked() ;
#endif

    // The GC heap-read entries below exist only in builds without ORP_POSIX
    // that define JIT_SAPPHIRE.
#ifdef ORP_POSIX
#else
#ifdef JIT_SAPPHIRE
     case ORP_RT_GC_HEAP_REF_EQUAL:
         return gc_sapphire_equal;
	//::
	//Read barrier
	//::
	// ORP_RT_GC_READ_BARRIER_DEBUG shares gc_heap_read_ref with the next case.
	case ORP_RT_GC_READ_BARRIER_DEBUG:
    case ORP_RT_GC_HEAP_READ_REF:
        return gc_heap_read_ref;
    case ORP_RT_GC_HEAP_READ_INT8:
        return gc_heap_read_int8;
    case ORP_RT_GC_HEAP_READ_INT16:
        return gc_heap_read_int16;
    case ORP_RT_GC_HEAP_READ_UINT16:
        return gc_heap_read_uint16;
    case ORP_RT_GC_HEAP_READ_INT32:
        return gc_heap_read_int32;
    case ORP_RT_GC_HEAP_READ_FLOAT:
        return gc_heap_read_float;
    case ORP_RT_GC_HEAP_READ_DOUBLE:
        return gc_heap_read_double;
    case ORP_RT_GC_HEAP_READ_INT64:
        return gc_heap_read_int64;
    case ORP_RT_GC_HEAP_READ_POINTER_SIZE_INT:
        return gc_heap_read_pointer_size_int;
    case ORP_RT_GC_HEAP_READ_GLOBAL_SLOT:
        return gc_heap_read_global_slot;
    case ORP_RT_GC_VOLATILE_HEAP_READ_REF:
        return gc_volatile_heap_read_ref;
    case ORP_RT_GC_VOLATILE_HEAP_READ_INT8:
        return gc_volatile_heap_read_int8;
    case ORP_RT_GC_VOLATILE_HEAP_READ_INT16:
        return gc_volatile_heap_read_int16;
    case ORP_RT_GC_VOLATILE_HEAP_READ_UINT16:
        return gc_volatile_heap_read_uint16;
    case ORP_RT_GC_VOLATILE_HEAP_READ_INT32:
        return gc_volatile_heap_read_int32;
    case ORP_RT_GC_VOLATILE_HEAP_READ_FLOAT:
        return gc_volatile_heap_read_float;
    case ORP_RT_GC_VOLATILE_HEAP_READ_DOUBLE:
        return gc_volatile_heap_read_double;
    case ORP_RT_GC_VOLATILE_HEAP_READ_INT64:
        return gc_volatile_heap_read_int64;
    case ORP_RT_GC_VOLATILE_HEAP_READ_POINTER_SIZE_INT:
        return gc_volatile_heap_read_pointer_size_int; 
#endif
#endif

    // Type checks.
    case ORP_RT_CHECKCAST:
        return getaddress__orp_checkcast_naked();
    case ORP_RT_INSTANCEOF:
        return (void *)orp_instanceof;
    case ORP_RT_INSTANCEOF_CLASS:
        return (void *)orp_instanceof_class;
    // Monitors: the plain and _STATIC ids resolve to the same stub.
    case ORP_RT_MONITOR_ENTER:
    case ORP_RT_MONITOR_ENTER_STATIC:
//#ifdef ORP_STATS
//        return (void *)monitor_enter_instrumented;
//#else
        return getaddress__orp_monitor_enter_naked();
//#endif

    case ORP_RT_MONITOR_EXIT:
    case ORP_RT_MONITOR_EXIT_STATIC:
//#ifdef ORP_STATS
//        return (void *)monitor_exit_instrumented;
//#else
        return getaddress__orp_monitor_exit_naked();
//#endif

    case ORP_RT_GET_INTERFACE_VTABLE_VER0:
        return getaddress__orp_get_interface_vtable_old_naked();  //tryitx
    case ORP_RT_INITIALIZE_CLASS:
        return getaddress__orp_initialize_class_naked();
    case ORP_RT_THROW_LINKING_EXCEPTION:
        return getaddress__orp_throw_linking_exception_naked();
    // 32-bit integer arithmetic helpers.
    case ORP_RT_IMUL:
        return (void *)orp_rt_imul_common;
    case ORP_RT_IDIV:
        return (void *)orp_rt_idiv_common;
    case ORP_RT_IREM:
        return (void *)orp_rt_irem_common;
    // JVMDI method entry/exit event stubs.
    case JVMDI_RT_EVENT_METHOD_ENTRY:
        return getaddress__jvmdi_method_entry_event();
    case JVMDI_RT_EVENT_METHOD_EXIT:
        return getaddress__jvmdi_method_exit_event();
    default:
        break;
    }
    // Reached only for an id with no case above.
    assert(0);
    return 0;
} //orp_get_rt_support_addr



