// Copyright (C)  2000 Intel Corporation.  All rights reserved.
//
// $Header: /usr/development/orp/orp/arch/ia32/ia32_o1_jit/lazy_code_selector.cpp,v 1.8 2001/12/11 16:20:26 rlhudson Exp $
//

#include "defines.h"
#include "jit_intf.h"
#include "internal_jit_intf.h"
#include "jit_runtime_support.h"
#include <iostream.h>
#include <stdarg.h>
#include "code_emitter.h"
#include "data_emitter.h"
#include "cg_prepass.h"
#include "operand.h"
#include "lazy_code_selector.h"
#include "register_manager.h"
#include "stack.h"
#include "cg_field_access.h"
#include "cg_constant.h"
#include "cg_method_invocation.h"
#include "cg_helper.h"
#include "cg_array_access.h"
#include "cg_shift.h"
#include "cg_mul_div_mod.h"
#include "cg_conversion.h"
#include "cmp_branch.h"
#include "cg_dup.h"
#include "jit.h"
#include "gc_tags.h"
#include "cg_load_store.h"
#include "regalloc.h"
#include "register_allocator.h"
#include "profiling.h"
#include "disasm_intf.h"
#include "dump_jit.h"
#include "fp_compatibility.h"
#include "o1_debugging_support.h"


// #define _TRACE


#ifndef NO_BOUNDS_CHECKING
#include "bounds_checking.h"
#endif // NO_BOUNDS_CHECKING

#ifdef VTune_Support
#include "vtune.h"
#endif // VTune_Support

#if 0
unsigned ken_total_gen_mi = 0;   // KEN testing
unsigned ken_total_gen_code = 0; // KEN testing
unsigned ken_total_gen_const_data = 0;
unsigned ken_total_gen_data = 0;
unsigned ken_total_gen_info = 0;
unsigned ken_total_gen = 0;
#endif // 0

//
// Static constants backing the fconst_<n> and dconst_<n> bytecodes.
// They are ordinary (non-const) namespace-scope objects.
//
float  FCONST0 = 0.0F;
float  FCONST1 = 1.0F;
float  FCONST2 = 2.0F;
double DCONST0 = 0.0;
double DCONST1 = 1.0;

// NOTE(review): presumably switches on the JIT dump facility declared in
// dump_jit.h -- confirm against the readers of this flag.
bool L1a_do_dumpjit = true;

// Fetch the type of the argument the iterator currently designates and step
// the iterator to the following argument (argList is updated in place).
//
// The value returned is whatever curr_arg() reports for the position the
// iterator had on entry.  The caller visible in this file
// (GC_Tags_emit_init) treats JAVA_TYPE_END as the end-of-list marker.
// NOTE(review): an older version of this comment named JAVA_TYPE_UNDEF as
// the terminator -- confirm which sentinel curr_arg() actually yields.
static Java_Type getArgumentType(
                                 Arg_List_Iterator  &argList)      /*INOUT*/
{
    Java_Type type_at_cursor = curr_arg(argList);
    argList = advance_arg_iterator(argList);
    return type_at_cursor;
}


// x86 condition codes in the order eq, ne, lt, ge, gt, le.
// NOTE(review): presumably indexed by the distance of a conditional-branch
// bytecode from the first opcode of its group -- confirm at the use sites.
X86_CC branch_cc[] = {
    cc_eq, cc_ne,
    cc_lt, cc_ge,
    cc_gt, cc_le
};

void Call_Patch::apply(char *code_block) {
    char *inst = code_block + offset;
    Imm_Opnd(target - inst - 4).emit32(inst);
}

void Branch_Patch::apply(char *code_block) {
    char *inst = code_block + offset;
    Imm_Opnd(target_offset - offset - 4).emit32(inst);
}

void Table_Entry_Patch::apply(char *code_block,char *data_block) {
    char *target = code_block + target_offset;
    Imm_Opnd((unsigned)(target)).emit32(data_block + data_offset);
}
void Mov_Patch::apply(char *code_block) {
    char *inst = code_block + offset;
    Imm_Opnd(((unsigned)inst) + target_offset - offset).emit32(inst);
}


//
// Most x86 instructions overwrite their destination (dst = dst op src), so
// the destination operand must sit in a register that may be clobbered.
//
// On return, dst designates such a register.  Both operands are passed by
// reference because they may be rewritten:
//   - for a commutable operation, src and dst are exchanged when src already
//     occupies a local register;
//   - when no register is free, src's registers are released, its address is
//     folded into a single register with lea, and src is replaced by a
//     Field_Operand based on that register.
//
void make_dst_killable(Mem_Manager& mem_manager, Code_Emitter& emitter, Stack& stack,
                       Operand*& dst, Operand*& src,int is_commutable) {
    unsigned local_regs = stack.reg_manager.local_regs();

    // A src held in a local register may be overwritten, so for commutable
    // operations let it play the role of the destination.
    if (is_commutable && src->is_reg() && src->hold_local_reg(local_regs)) {
        Operand *former_src = src;
        src = dst;
        dst = former_src;
    }

    // dst is already a register we are allowed to overwrite: done.
    if (dst->is_reg() && dst->hold_local_reg(local_regs))
        return;

    // Otherwise copy dst into a scratch register.  Release dst's own
    // resources first so they are candidates for the allocation below.
    dst->free_opnd(&stack.reg_manager);
    Reg_Operand *scratch = stack.reg_manager.get_reg();
    if (!scratch) {
        // No register left: release src's registers and collapse its
        // address into one base register, which frees one for dst.
        src->free_opnd(&stack.reg_manager);
        Reg_Operand *base = stack.reg_manager.get_reg();
#ifdef _DEBUG
        if(src->kind != Operand::Array || !base) {
            cout << "BUG: src is not an array or no reg" << endl;
            assert(0);
        }
#endif // _DEBUG
        ((Mem_Operand*)src)->emit_lea(emitter,&base->opnd);
        src = new (mem_manager) Field_Operand(base->opnd.reg_no(),0);
        scratch = stack.reg_manager.get_reg();
        assert(scratch);
    }

    dst->emit_mov_to_reg(emitter,&scratch->opnd);
    dst = scratch;
}

// Emit a two-operand ALU instruction "dst = dst opc src".
//
// dst is first forced into an overwritable register by make_dst_killable()
// (which may exchange dst and src when is_commutable is non-zero).  After
// the instruction is emitted src is released.  Returns the operand that
// holds the result.
Operand *emit_alu(Mem_Manager& mem_manager,
                  Code_Emitter& emitter,
                  Stack& stack,
                  X86_ALU_Opcode opc,
                  Operand *dst,
                  Operand *src,
                  unsigned is_commutable) {
    make_dst_killable(mem_manager,emitter,stack,dst,src,is_commutable);

    // From here on dst is known to be a scratch register.
    Reg_Operand *result_reg = (Reg_Operand*)dst;
    src->emit_alu_inst(emitter,&result_reg->opnd,opc);
    src->free_opnd(&stack.reg_manager);
    return dst;
}

// 32-bit binary ALU operation on the mimic stack: pops the right-hand
// operand (top of stack), then the left-hand operand, emits the instruction
// through emit_alu() and pushes the result.
void emit_alu32(Mem_Manager& mem_manager,Code_Emitter& emitter,Stack& stack,
                X86_ALU_Opcode opc,unsigned is_commutable) {
    Operand *rhs = stack.pop();
    Operand *lhs = stack.pop();
    Operand *result = emit_alu(mem_manager,emitter,stack,opc,lhs,rhs,is_commutable);
    stack.push(result);
}

//
// Low-word half of a 64-bit ALU operation.  Follows the same scheme as
// emit_alu(), except that dst is NOT freed before its value is copied into
// a scratch register: the high part of the operand is still needed by the
// caller afterwards.  src is released only when it is a register.
//
// Returns the register operand holding the low-word result.
//
Operand *emit_alu64_lo(Code_Emitter& emitter,
                       Stack& stack,
                       X86_ALU_Opcode opc,
                       Operand *dst,
                       Operand *src,
                       unsigned is_commutable) {
    unsigned local_regs = stack.reg_manager.local_regs();

    // Commutable op with src in a local register: use src as destination.
    if (is_commutable && src->is_reg() && src->hold_local_reg(local_regs)) {
        Operand *former_src = src;
        src = dst;
        dst = former_src;
    }

    // Obtain an overwritable register for the destination.
    Reg_Operand *result_reg;
    if (dst->is_reg() && dst->hold_local_reg(local_regs)) {
        result_reg = (Reg_Operand*)dst;
    } else {
        result_reg = stack.reg_manager.get_reg();
        assert(result_reg);
        dst->emit_mov_to_reg(emitter,&result_reg->opnd);
    }

    // result_reg is a scratch register at this point.
    src->emit_alu_inst(emitter,&result_reg->opnd,opc);
    if (src->is_reg())
        src->free_opnd(&stack.reg_manager);
    return (Operand*)result_reg;
}
//
//         |             |
//         +-------------+
//         | [ecx + 4]   | lo
// Field   +-------------+
//         | [ecx + 8]   | hi
//         +-------------+
//         | [ecx + edx] | lo
// Array   +-------------+
//         | [ecx + edx] | hi
//         +-------------+
// In this case, all three caller-save regs are used by the two operands.  
// We want to spill one of them to free up regs because alu64 needs one 
// extra reg if both operands are mem ( alu_op eax, mem).
//
// Guarantees that a 64-bit ALU operation on (dst, src) can obtain the extra
// register it needs.  If a register can already be allocated, nothing is
// emitted.  Otherwise one of the two operands is copied, half by half, into
// stack operands taken from op_pool, its registers are released, and the
// corresponding lo/hi references are redirected to those stack operands.
//
// src_lo/src_hi/dst_lo/dst_hi are in-out: exactly one pair is rewritten
// when a spill takes place.
//
void need_one_callee_for_alu64(Code_Emitter& emitter, Stack& stack,
                               Pre_Alloc_Operand_Pool& op_pool,
                               Operand*& src_lo,Operand*& src_hi,
                               Operand*& dst_lo,Operand*& dst_hi,
                               unsigned is_commutable) {
    // No action unless dst is in memory and, for a commutable op, src is in
    // memory as well.
    if ((is_commutable && !src_lo->is_mem()) || !dst_lo->is_mem()) return;
    // Probe the allocator: if a register is available, hand it straight
    // back and leave the operands untouched.
    Reg_Operand *reg;
    reg = stack.reg_manager.get_reg();
    if (reg) {
        reg->free_opnd(&stack.reg_manager);
        return;
    }
    assert(src_lo->kind != Operand::Stk && dst_lo->kind != Operand::Stk); 
    //
    // choose one callee reg that is not used by both operands for spilling
    //
    // NOTE(review): pick_a_callee_reg() is defined elsewhere; which register
    // it selects relative to the operand's own address registers cannot be
    // seen from here -- confirm before changing this sequence.
    X86_Reg_No r = ((Mem_Operand*)dst_lo)->pick_a_callee_reg();
    int spill_dst = 1;
    if (src_lo->contain(r)) {
        // src also involves r: pick from src instead and spill src.
        r = ((Mem_Operand*)src_lo)->pick_a_callee_reg();
        assert(!dst_lo->contain(r));
        spill_dst = 0;
    }
    Reg_Operand callee(r);
    Stack_Operand *spill_loc, *stk_lo, *stk_hi;
    // spill one operand
    //
    // In both branches the sequence is: save r's current value in
    // spill_loc, use r as a temporary to copy the lo and then the hi half
    // of the chosen operand into its stack operands, release the operand's
    // registers, and redirect the lo/hi references.  r is reloaded from
    // spill_loc after the branch.
    if (spill_dst) {
        // use src_lo's spill loc
        spill_loc = op_pool.nth_stack(stack.depth()+3);
        stk_lo    = op_pool.nth_stack(stack.depth()+1);
        stk_hi    = op_pool.nth_stack(stack.depth());
        callee.emit_mov_to_mem(emitter,&spill_loc->opnd);
        dst_lo->emit_mov_to_reg(emitter,&callee.opnd);
        callee.emit_mov_to_mem(emitter,&stk_lo->opnd);
        dst_hi->emit_mov_to_reg(emitter,&callee.opnd);
        callee.emit_mov_to_mem(emitter,&stk_hi->opnd);
        dst_lo->free_opnd(&stack.reg_manager);
        dst_hi->free_opnd(&stack.reg_manager);
        dst_lo = stk_lo;
        dst_hi = stk_hi;
    } else {
        // use dst_lo's spill loc
        spill_loc = op_pool.nth_stack(stack.depth()+1);
        stk_lo    = op_pool.nth_stack(stack.depth()+3);
        stk_hi    = op_pool.nth_stack(stack.depth()+2);
        callee.emit_mov_to_mem(emitter,&spill_loc->opnd);
        src_lo->emit_mov_to_reg(emitter,&callee.opnd);
        callee.emit_mov_to_mem(emitter,&stk_lo->opnd);
        src_hi->emit_mov_to_reg(emitter,&callee.opnd);
        callee.emit_mov_to_mem(emitter,&stk_hi->opnd);
        src_lo->free_opnd(&stack.reg_manager);
        src_hi->free_opnd(&stack.reg_manager);
        src_lo = stk_lo;
        src_hi = stk_hi;
    }
    // Restore the original contents of r.
    spill_loc->emit_mov_to_reg(emitter,&callee.opnd);
}
// 64-bit binary ALU operation on the mimic stack.
//
// Pops the right-hand then the left-hand 64-bit operand, makes sure a spare
// register exists (need_one_callee_for_alu64), emits opc_lo on the low words
// followed by opc_hi on the high words, and pushes the 64-bit result.
void emit_alu64(Mem_Manager& mem_manager,
                Code_Emitter& emitter,Stack& stack,
                Pre_Alloc_Operand_Pool& op_pool,
                X86_ALU_Opcode opc_lo,X86_ALU_Opcode opc_hi,
                unsigned is_commutable) {
    Operand *rhs_lo, *rhs_hi;
    Operand *lhs_lo, *lhs_hi;
    stack.pop64(rhs_lo,rhs_hi);
    stack.pop64(lhs_lo,lhs_hi);

    need_one_callee_for_alu64(emitter,stack,op_pool,rhs_lo,rhs_hi,lhs_lo,lhs_hi,is_commutable);

    // The low word must be emitted before the high word.
    Operand *result_lo = emit_alu64_lo(emitter,stack,opc_lo,lhs_lo,rhs_lo,is_commutable);
    Operand *result_hi = emit_alu(mem_manager,emitter,stack,opc_hi,lhs_hi,rhs_hi,is_commutable);
    stack.push64(result_lo,result_hi);
}

// Emit a cmp instruction for the two operands on top of the mimic stack and
// return the condition code the following branch must test.
//
// op2 is the value popped first (top of stack), op1 the one beneath it.
// x86 cmp cannot take every operand combination, so the operands may be
// emitted in swapped order; in that case the returned code is
// cc_commute_map[cc] instead of cc.  All remaining stack entries are homed
// (stack.home_all()) before the compare is emitted.
X86_CC emit_cmp(Code_Emitter& emitter, Stack& stack, X86_CC cc) {
    Operand *op2 = stack.pop();
    Operand *op1 = stack.pop();
    //
    // To do: hack! need to be fixed later
    //
    // When both operands are in memory, one of them is loaded into a
    // register up front: op1 if it is an array operand, otherwise op2.
    if (op2->is_mem() && op1->is_mem()) {
        if (op1->kind == Operand::Array) {
            op1->free_opnd(&stack.reg_manager);
            Reg_Operand *reg = stack.reg_manager.get_reg();
            op1->emit_mov_to_reg(emitter,&reg->opnd);
            op1 = reg;
        } else {
            op2->free_opnd(&stack.reg_manager);
            Reg_Operand *reg = stack.reg_manager.get_reg();
            op2->emit_mov_to_reg(emitter,&reg->opnd);
            op2 = reg;
        }
    }
    stack.home_all(); // make sure all operands are spilled at the end of bb
    // commute tracks whether the emitted compare has its operands in the
    // opposite order from (op1, op2) as popped.
    unsigned commute = 0;
    if (!op1->is_reg()) {
        // m r, i m, m m, i r, m i, i i
        if (op2->is_reg() || op2->is_mem()) {
            // m r --> r m
            // i m --> m i
            // m m --> m m
            // i r --> r i
            // commute
            commute = 1;
            Operand *o = op1;
            op1 = op2;
            op2 = o;
        } 
        // m i, i i are untouched
    }
    if (op1->is_reg()) {
        // r r, r m, r i
        Reg_Operand *op1_reg = (Reg_Operand*)op1;
        op2->emit_alu_inst(emitter,&op1_reg->opnd,cmp_opc);
    } else if (op1->is_mem()) {
        // m m, m i
        // load op2 into register and make into m r
        op2->free_opnd(&stack.reg_manager);
        Reg_Operand *reg = stack.reg_manager.get_reg();
        op2->emit_mov_to_reg(emitter,&reg->opnd);
        // m r --> r m
        // The register (holding op2) becomes the first cmp operand, which
        // reverses the order once more -- hence the toggle below.
        op1->emit_alu_inst(emitter,&reg->opnd,cmp_opc);
        reg->free_opnd(&stack.reg_manager);
        commute = !commute;
    } else {
        // i i
        // Neither operand is a register or memory: load op1 into a register
        // and compare it against op2.
        op1->free_opnd(&stack.reg_manager);
        Reg_Operand *reg = stack.reg_manager.get_reg();
        op1->emit_mov_to_reg(emitter,&reg->opnd);
        op2->emit_alu_inst(emitter,&reg->opnd,cmp_opc);
        reg->free_opnd(&stack.reg_manager);
    }
    if (commute)
        return cc_commute_map[cc];
    return cc;
}

// Emit a comparison of the top-of-stack value against zero for a
// single-operand conditional branch.
//
//   emitter  - receives the generated instructions
//   stack    - mimic stack; the operand is popped and every remaining entry
//              is homed, since this is the end of a basic block
//   bytecode - opcode of the branch being compiled; it selects between
//              "test reg,reg" and "cmp reg,0"
//
// The operand is loaded into a register if it is not already in one, and
// that register is released before returning.
void emit_cmp_zero(Code_Emitter& emitter,Stack& stack, unsigned char bytecode) {
    Operand *src = stack.pop();
    //
    // spill all stack locations at the end of a block
    //
    stack.home_all();
    //
    // load src into a register reg and emit test reg,reg
    //
    // we may do better with a direct cmp
    //
    Reg_Operand *dst_reg;
    if (!src->is_reg()) {
        src->free_opnd(&stack.reg_manager);
        dst_reg = stack.reg_manager.get_reg();
        assert(dst_reg);
        src->emit_mov_to_reg(emitter,&dst_reg->opnd);
    } else {
        dst_reg = (Reg_Operand*)src;
    }
    if ( bytecode==0x99 ||      // ifeq
         bytecode==0x9a ||      // ifne
         bytecode==0xc6 ||      // ifnull
         bytecode==0xc7) {      // ifnonnull
        //
        // if{eq,ne,null,nonnull}
        //
        emitter.emit_test(&dst_reg->opnd,&dst_reg->opnd);
    } else {
        // Use a named operand: taking the address of a temporary
        // (&Imm_Opnd(0)) is not valid ISO C++.
        Imm_Opnd zero(0);
        emitter.emit_alu(cmp_opc,&dst_reg->opnd,&zero);
    }
    dst_reg->free_opnd(&stack.reg_manager);
}

// Emit an unconditional jump.
//
//   target_offset - offset of the target; a negative value denotes a
//                   backward branch
//   target_index  - not used by this function
//
// For a forward (non-negative) target a jump with a 32-bit zero placeholder
// displacement is emitted, to be patched later.  For a backward branch this
// function emits nothing.
// NOTE(review): presumably the backward-branch case is emitted by the
// caller -- confirm.
void emit_jump(Code_Emitter& emitter,
               int target_offset,
               unsigned target_index) {
    (void)target_index;
    if (target_offset >= 0) {
        //
        // emit a branch with 32-bit offset that is later patched
        //
        // Named operand instead of &Imm_Opnd(0): taking the address of a
        // temporary is not valid ISO C++.
        Imm_Opnd placeholder(0);
        emitter.emit_jump32(&placeholder);
    }
    //
    // backward branches do not require patching
    //
}

// Emit code that clears the GC tag bit of variable `index` in the frame's
// tag word.  Nothing is emitted when prepass.aloaded_vars[index] is zero.
void GC_Tags_emit_clear(Frame& frame,Code_Emitter& emitter,CG_Prepass& prepass,unsigned index) {
    if (prepass.aloaded_vars[index] == 0)
        return;
    //
    // and	offset[ebp, esp],~mask
    //
    // note that since we are clearing a bit, we use the
    // complement of the mask
    //
    unsigned mask = ~GC_Tags_mask(index);
    M_Base_Opnd opnd(frame.base_reg,
        frame.extra_offset(GC_Tags_word_no(index)));
    // Named operand instead of &Imm_Opnd(mask): taking the address of a
    // temporary is not valid ISO C++.
    Imm_Opnd mask_opnd(mask);
    emitter.emit_alu(and_opc,&opnd,&mask_opnd);
}

// Emit code that sets the GC tag bit of variable `index` in the frame's tag
// word.  Nothing is emitted when prepass.aloaded_vars[index] is zero.
void GC_Tags_emit_set(Frame& frame,Code_Emitter& emitter,CG_Prepass& prepass,unsigned index) {
    if (prepass.aloaded_vars[index] == 0)
        return;
    //
    // or	offset[ebp, esp],mask
    //
    unsigned mask = GC_Tags_mask(index);
    M_Base_Opnd opnd(frame.base_reg,
        frame.extra_offset(GC_Tags_word_no(index)));
    // Named operand instead of &Imm_Opnd(mask): taking the address of a
    // temporary is not valid ISO C++.
    Imm_Opnd mask_opnd(mask);
    emitter.emit_alu(or_opc,&opnd,&mask_opnd);
}

// Emit the prolog code that initialises the frame's GC tag words.
//
// The initial tag values are computed here, at compile time, from the
// method signature: a bit is set for the receiver of a non-static method
// and for every class/array argument that prepass.aloaded_vars marks.
// One "mov tag_word, imm32" is then emitted per tag word.
//
// Nothing is emitted when the prepass found no aload'ed variables.
void GC_Tags_emit_init(Frame& frame,Code_Emitter& emitter,
                       CG_Prepass& prepass,Method_Handle handle,
                       Mem_Manager& mem_manager) {
    //
    // if no variables are aloaded, then we don't need to do anything
    //
    if (prepass.n_aloaded == 0) 
        return;
    unsigned n_words = GC_Tags_n_words(frame.n_args+frame.n_vars);
    // Host-side scratch array, sized by its element type.
    unsigned *gc_tag_words = (unsigned*)mem_manager.alloc(n_words * sizeof(unsigned));
    unsigned i;
    for (i = 0; i < n_words; i++) {
        gc_tag_words[i] = 0;
    }
    //
    // for virtual functions, we need to initialize arg0 explicitly
    //
    i = 0;
    if (!method_is_static(handle)) {
        gc_tag_words[0] = 1;
        i++;
    }
    //
    // initialize gc tags for incoming arguments
    //
    Arg_List_Iterator args = method_get_argument_list(handle);
    Java_Type type;
    while ((type = getArgumentType(args)) != JAVA_TYPE_END) {
        if ((type == JAVA_TYPE_CLASS || type == JAVA_TYPE_ARRAY) 
            && prepass.aloaded_vars[i]) {
            unsigned word_no = GC_Tags_word_no(i);
            unsigned mask = GC_Tags_mask(i);
            gc_tag_words[word_no] |= mask;
        }
        // long and double arguments occupy two variable slots
        if (type == JAVA_TYPE_LONG || type == JAVA_TYPE_DOUBLE)
            i++;
        i++;
    }
    for (i=0; i < n_words; i++) {
        M_Base_Opnd opnd(frame.base_reg,
            frame.extra_offset(i));
        // Named operand instead of &Imm_Opnd(...): taking the address of a
        // temporary is not valid ISO C++.
        Imm_Opnd init_value(gc_tag_words[i]);
        emitter.emit_mov(&opnd,&init_value);
    }
}

// Emit the method prolog: frame setup, callee-saved register pushes,
// initialisation of GC-related state, optional instrumentation/debug hooks
// and, for synchronized methods, the monitor-enter call.
//
// callee_saved_regs is a bit mask (callee_saved_*_mask) of the registers to
// push; it is a by-value copy, so clearing the ebp bit here does not affect
// the caller.  code_patch_list may be extended by the calls emitted here.
// regalloc and method_name are not examined outside the blocks shown.
void emit_prolog(Code_Emitter& emitter,
                 Frame&	frame,
                 unsigned callee_saved_regs,
                 Method_Handle method_handle,
                 Mem_Manager &mem_manager,
                 CG_Prepass &prepass,
                 Code_Patch *& code_patch_list,
                 Jit_Method_Info *method_info,
                 Register_Allocator *regalloc,
                 const char *method_name,
                 Stack& stack,
                 Profile_Rec* prof_rec,
                 Profile_Patch& prof_patch) {
    //
    // the prolog handles both EBP-based and ESP-based frames and looks like:
    //
    //	push	ebp			-- EBP frame only
    //	mov		ebp,esp		-- EBP frame only
    //	sub		esp,n_locals+n_spill_words
    //	push	ebx
    //	push	ebp			-- ESP frame only
    //	push	esi
    //	push	edi
    //	mov		gc_tags, init
    //	< ... vtune call graph support ... >
    //	< ... monitorenter ... >	-- synchronized methods only
    //
    
    // frame size in bytes: 4 bytes per variable, spill and extra word
    unsigned frame_size = (frame.n_vars + frame.n_spill + frame.n_extra) << 2;
    if (frame.base_reg == ebp_reg) {
        emitter.emit_push(&ebp_opnd);
        emitter.emit_mov(&ebp_opnd,&esp_opnd);
        // clear ebp in callee saved mask so it isn't later pushed
        callee_saved_regs &= ~callee_saved_ebp_mask;
    }
    if (frame_size > 0)
        emitter.emit_alu(sub_opc,&esp_opnd,&Imm_Opnd(frame_size));
    
    //
    // push callee-saved registers
    // (emit_epilog pops them in the reverse order)
    //
    if (callee_saved_regs & callee_saved_ebx_mask)
        emitter.emit_push(&ebx_opnd);
    if (callee_saved_regs & callee_saved_ebp_mask)
        emitter.emit_push(&ebp_opnd);
    if (callee_saved_regs & callee_saved_esi_mask)
        emitter.emit_push(&esi_opnd);
    if (callee_saved_regs & callee_saved_edi_mask)
        emitter.emit_push(&edi_opnd);
    
    //
    // initialize GC tags for incoming arguments. this must come before the
    // monitorenter for a synchronized method in case garbage collection happens
    // at that call site.
    //
#ifdef VAR_CLONING
    if(!IS_INVALID_RV_BITMAP(method_info->ref_var_bitmap)) {
        // First set all reference variables to null.  Make sure you don't overwrite arguments!
        // (idx_base below skips the incoming arguments.)
        R_Opnd *zero_opnd = &eax_opnd;
        unsigned n_vars = frame.n_vars;
        unsigned idx_base = method_info->num_in_args;
        char *local_var_info = prepass.local_var_info;
        bool ref_vars_exist = false;
        // NOTE(review): the four *_assigned flags are written below but
        // never read within this function.
        bool esi_assigned = false;
        bool edi_assigned = false;
        bool ebp_assigned = false;
        bool ebx_assigned = false;
        // Pass 1: find out whether any reference variable exists, and if
        // one of them lives in a register use that register as the zero
        // source (the last such register found wins; default is eax).
        for(unsigned i = 0; i < n_vars; i++) {
            unsigned idx = idx_base + i;
            unsigned lvi_int = local_var_info[idx];
            Local_Var_Info &lvi = *((Local_Var_Info*)(&lvi_int));
            if(lvi.ref_var) {
                ref_vars_exist = true;
                X86_Reg_No reg = regalloc->reg_var_allocated_to(idx, NULL/*XXX-hack*/);
                if (reg != n_reg) {
                    if (reg == esi_reg) {
                        esi_assigned = true;
                        zero_opnd = &esi_opnd;
                    } else if (reg == edi_reg) {
                        edi_assigned = true;
                        zero_opnd = &edi_opnd;
                    } else if (reg == ebx_reg) {
                        ebx_assigned = true;
                        zero_opnd = &ebx_opnd;
                    } else if (reg == ebp_reg) {
                        ebp_assigned = true;
                        zero_opnd = &ebp_opnd;
                    }
                }
            }
        }
        
        // Pass 2: zero the chosen register, then copy it into every other
        // reference variable (register or frame slot).
        if(ref_vars_exist) {
            emitter.emit_alu(xor_opc, zero_opnd, zero_opnd);
            for(unsigned i = 0; i < n_vars; i++) {
                unsigned idx = idx_base + i;
                unsigned lvi_int = local_var_info[idx];
                Local_Var_Info &lvi = *((Local_Var_Info*)(&lvi_int));
                if(lvi.ref_var) {
                    X86_Reg_No reg = regalloc->reg_var_allocated_to(idx, NULL/*XXX-hack*/);
                    if (reg != n_reg) {
                        if (reg == esi_reg) {
                            if(zero_opnd != &esi_opnd)
                                emitter.emit_mov(&esi_opnd, zero_opnd);
                        } else if (reg == edi_reg) {
                            if(zero_opnd != &edi_opnd)
                                emitter.emit_mov(&edi_opnd, zero_opnd);
                        } else if (reg == ebx_reg) {
                            if(zero_opnd != &ebx_opnd)
                                emitter.emit_mov(&ebx_opnd, zero_opnd);
                        } else if (reg == ebp_reg) {
                            if(zero_opnd != &ebp_opnd)
                                emitter.emit_mov(&ebp_opnd, zero_opnd);
                        } else {
                            assert(0);
                        }
                    } else {
                        emitter.emit_mov(&M_Var_Opnd(frame, idx), zero_opnd);
                    }
                }
            }
        }
    }
    
    // We have to initialize GC_TAGS only if we couldn't disambiguate variables.
#endif //VAR_CLONING
    if(IS_INVALID_RV_BITMAP(method_info->ref_var_bitmap))
        GC_Tags_emit_init(frame,emitter,prepass,method_handle,mem_manager);
    
    // For ESP-based frames, handle every incoming argument that has been
    // allocated to a register and is live on entry.
    // NOTE(review): gen_reg_store32 is defined elsewhere; presumably it
    // moves the value between the argument's frame slot and its register --
    // confirm the direction.
    if (method_info->is_esp_based) {
        for (unsigned in = 0; in < frame.n_args; in++) {
            X86_Reg_No reg = regalloc->reg_var_allocated_to(in, NULL/*XXX-hack*/);
            if (reg != n_reg && regalloc->var_is_live_on_entry(in))
                gen_reg_store32(emitter,stack,reg,&Mem_Var_Operand(frame,in,0));
        }
    }
    //
    // emit instrumenting code (method entry)
    //
    inserting_instrumenting_code(emitter, prof_patch, (unsigned)-1, (unsigned*)&prof_rec->m_entry);

    //
    // insert a call for JVMDI_EVENT_METHOD_ENTRY
    //
    if (jvmdi_support) {
        gen_jvmdi_event_method(mem_manager,
                               emitter, 
                               stack,
                               code_patch_list,
                               method_info,
                               true);
    }

#ifdef VTune_Support_CALLGRAPH
    if (VTuneModeFlags & iJIT_BE_NOTIFY_ON_METHOD_ENTRY) {
        // the call's 32-bit displacement starts one byte past the opcode
        unsigned patch_offset = emitter.get_offset()+1;
        emitter.emit_call((char*) iJIT_MethodEntered);
        code_patch_list =
            new(mem_manager) Call_Patch(code_patch_list,patch_offset,(char*) iJIT_MethodEntered);
    }
#endif // VTune_Support_CALLGRAPH
    
    //
    // WARNING!!!!!!!!! WARNING!!!!!!!!! WARNING!!!!!!!!! WARNING!!!!!!!!! WARNING!!!!!!!!!
    //
    // Must do monitorenter (synchronized methods) here. Garbage collection assumes
    // that a frame exists. If GC happens at the call to monitorenter, then the frame
    // must be active!!!
    //
    // WARNING!!!!!!!!! WARNING!!!!!!!!! WARNING!!!!!!!!! WARNING!!!!!!!!! WARNING!!!!!!!!!
    //
    
    DWORD method_flags = method_get_flags(method_handle);
    if (method_flags & ACC_SYNCHRONIZED) {
        // In both cases the current code offset is recorded as the call
        // site's IP before the monitor-enter sequence is generated; the
        // static variant additionally passes the method handle.
        if (method_is_static(method_handle)) {
            // synchronized static method
            method_info->cs_info[method_info->cnt].call_IP = (unsigned)emitter.get_offset();
            gen_synch_method_enter(mem_manager,emitter,
                code_patch_list,
                frame.n_args,method_info,method_handle);
        } else {
            // synchronized virtual method
            method_info->cs_info[method_info->cnt].call_IP = (unsigned)emitter.get_offset();
            gen_synch_method_enter(mem_manager,emitter,
                code_patch_list,
                frame.n_args,method_info);
        }
    }
}

// Emit the method epilog for one return bytecode: optional monitor-exit and
// notification calls, transfer of the return value from the mimic stack to
// its return location, restoration of callee-saved registers, frame
// teardown and the final ret.
//
// callee_saved_regs is a by-value bit mask (callee_saved_*_mask).
// num_int_return is added to frame.n_args when generating the monitor-exit
// call.  ret_bc is the return bytecode's address, recorded with the GC bit
// vector created for synchronized methods returning a reference.
void emit_epilog(Code_Emitter& emitter,
                 Frame&	frame,
                 unsigned callee_saved_regs,
                 unsigned num_int_return,
                 Method_Handle method_handle,
                 DWORD method_flags,
                 Mem_Manager& mem_manager,
                 Code_Patch *& code_patch_list,
                 Jit_Method_Info *method_info,
                 Stack &stack,
                 CG_Prepass &prepass,
                 const unsigned char *ret_bc) {
    //
    //	stack.home_all()
    //  < ... call MonExit ...>         -- for synchronized method
    //	< ... call VTune ... >			-- Vtune call graph support
    //	mov ret val to ret reg
    //	pop edi
    //	pop esi
    //	pop ebp							-- ESP frame only
    //	pop ebx
    //	mov esp, ebp					-- EBP frame only
    //	pop ebp							-- EBP frame only
    //	add	esp, n_locals+n_spill_words	-- ESP frame only
    //
    // frame size in bytes: 4 bytes per variable, spill and extra word
    unsigned frame_size = (frame.n_vars + frame.n_spill + frame.n_extra) << 2;
    
    //
    // set the offset of the return here because home_all() emits code
    //
    if (method_flags & ACC_SYNCHRONIZED) {
        method_info->cs_info[method_info->cnt].call_IP = (unsigned)emitter.get_offset();
    }
    Java_Type ret_type = method_get_return_type(method_handle);
    //
    // must make sure that monitorexit and vtune support do not clobber
    // the return values
    //
    // NOTE(review): this test is guarded by VTune_Support while the call
    // further down is guarded by VTune_Support_CALLGRAPH.
    if ((method_flags & ACC_SYNCHRONIZED) 
#ifdef VTune_Support
        || (VTuneModeFlags & iJIT_BE_NOTIFY_ON_METHOD_EXIT)
#endif
        ) {
        stack.home_all();
    }
    //
    // WARNING!!!!!!!!! WARNING!!!!!!!!! WARNING!!!!!!!!! WARNING!!!!!!!!! WARNING!!!!!!!!!
    //
    // Must do monitorexit (synchronized methods) here. Garbage collection assumes
    // that a frame exists. If GC happens at the call to monitorexit, then the frame
    // must still be active!!!
    //
    // WARNING!!!!!!!!! WARNING!!!!!!!!! WARNING!!!!!!!!! WARNING!!!!!!!!! WARNING!!!!!!!!!
    //
    
    if (method_flags & ACC_SYNCHRONIZED) {
        if (method_is_static(method_handle)) {
            // synchronized static method
            gen_synch_method_exit(mem_manager,emitter,
                code_patch_list,
                frame.n_args + num_int_return,
                method_info,
                method_handle);
        } else {
            // synchronized virtual method
            gen_synch_method_exit(mem_manager,emitter,
                code_patch_list,
                frame.n_args + num_int_return,
                method_info);
        }
    }
    
#ifdef VTune_Support_CALLGRAPH
    if (VTuneModeFlags & iJIT_BE_NOTIFY_ON_METHOD_EXIT) {
        
        // the call's 32-bit displacement starts one byte past the opcode
        unsigned patch_offset = emitter.get_offset()+1;
        emitter.emit_call((char*) iJIT_MethodExited);
        code_patch_list =
            new(mem_manager) Call_Patch(code_patch_list,patch_offset,(char*) iJIT_MethodExited);
    }
#endif // VTune_Support_CALLGRAPH

    //
    // insert a call for the JVMDI method-exit event (same helper as in the
    // prolog, called with false instead of true)
    //
    if (jvmdi_support) {
        gen_jvmdi_event_method(mem_manager,
                               emitter, 
                               stack,
                               code_patch_list,
                               method_info,
                               false);
    }

    //
    // create garbage collection info
    //
    // For a synchronized method returning a reference, record a bit vector
    // with the top-of-stack entry (the return value) set for this return
    // bytecode.
    if ((ret_type == JAVA_TYPE_CLASS || ret_type == JAVA_TYPE_ARRAY) &&
        (method_flags & ACC_SYNCHRONIZED)) {
        Bit_Vector *bv = new (mem_manager) Bit_Vector(stack.depth(),mem_manager);
        bv->set(stack.depth() - 1);
        Call_BV_List_Element *bvle = new (mem_manager) Call_BV_List_Element(bv,ret_bc);
        prepass.gc_site_vectors->push(bvle);
    }
    
    //
    // move return values from mimic stack to registers
    //
    switch(ret_type) {
    case JAVA_TYPE_BYTE: // signed
    case JAVA_TYPE_CHAR: // character
    case JAVA_TYPE_INT: // integer
    case JAVA_TYPE_SHORT: // 16-bit signed short
    case JAVA_TYPE_BOOLEAN: // boolean
    case JAVA_TYPE_CLASS: // object
    case JAVA_TYPE_ARRAY: // array
        // 32-bit result in eax
        {
            Operand *src = stack.pop();
            stack.maintain_precise_exception();
            src->emit_mov_to_reg(emitter,&eax_opnd);
        }
        break;
    case JAVA_TYPE_LONG: // long
        // 64-bit result in edx:eax pair
        {
            Operand *src_lo, *src_hi;
            stack.pop64(src_lo,src_hi);
            stack.maintain_precise_exception();
            // nothing to move when the value already sits in edx:eax
            if (!(src_lo->is_reg() && src_lo->contain(eax_reg) && // src_lo is not eax
                src_hi->is_reg() && src_hi->contain(edx_reg))) {// src_hi is not edx
                stack.push64(src_lo,src_hi);
                //
                // if we don't home_all, then we need to worry about shuffle value 
                // so that result ends up in edx:eax pair
                //
                stack.home_all();
                stack.pop64(src_lo,src_hi);
                src_lo->emit_mov_to_reg(emitter,&eax_opnd);
                src_hi->emit_mov_to_reg(emitter,&edx_opnd);
            }
        }
        break;
    case JAVA_TYPE_FLOAT: // single FP
        {
            
            // 32-bit result on FP stack
            Operand *src = stack.pop();
            stack.maintain_precise_exception();
            if (stack.fp_strict_mode)
            {
                // in strict mode the value is required to be in memory
                assert(src->is_mem());
                Mem_Operand *m_src = (Mem_Operand*)src;
                emitter.emit_fld(m_src->mem_opnd(),0);
            }
            else
            {
                if (src->is_mem()){
                    Mem_Operand *m_src = (Mem_Operand*)src;
                    emitter.emit_fld(m_src->mem_opnd(),0);
                    stack.fp_inc_cnt(); 
                }
                result_on_fp_stack(mem_manager, stack, false);
            }
            break;
        }
    case JAVA_TYPE_DOUBLE: // double FP
        // 64-bit result on FP stack
        {
            Operand *src_lo;
            Operand *src_hi;
            stack.pop64(src_lo,src_hi);
            stack.maintain_precise_exception();
            
            if (stack.fp_strict_mode)
            {
                // in strict mode both halves are required to be in memory
                assert(src_lo->is_mem());
                assert(src_hi->is_mem());
                Mem_Operand *m_src = (Mem_Operand*)src_lo;
                emitter.emit_fld(m_src->mem_opnd(),1);
            }
            else
            {
                // NOTE(review): unlike the float case, result_on_fp_stack()
                // is not called here -- confirm that this is intended.
                if (src_lo->is_mem() || src_hi->is_mem()) {
                    Mem_Operand *m_src = (Mem_Operand*)src_lo;
                    emitter.emit_fld(m_src->mem_opnd(),1);
                    stack.fp_inc_cnt();
                }
            }
            break;
        }
    case JAVA_TYPE_VOID: // void return
        stack.maintain_precise_exception();
        break;
    default:
        assert(0);
        break;
    }
    
    // 
    // restore callee saved registers
    //
    if (frame.base_reg == ebp_reg) {
        // clear ebp in callee saved mask so it isn't popped below; for an
        // EBP frame it is restored by the frame teardown instead
        callee_saved_regs &= ~callee_saved_ebp_mask;
    }
    
    //
    // pop callee-saved registers (reverse of the push order in emit_prolog)
    //
    if (callee_saved_regs & callee_saved_edi_mask)
        emitter.emit_pop(&edi_opnd);
    if (callee_saved_regs & callee_saved_esi_mask)
        emitter.emit_pop(&esi_opnd);
    if (callee_saved_regs & callee_saved_ebp_mask)
        emitter.emit_pop(&ebp_opnd);
    if (callee_saved_regs & callee_saved_ebx_mask)
        emitter.emit_pop(&ebx_opnd);
    
    //
    // restore caller's frame
    //
    if (frame.base_reg == ebp_reg) {
        emitter.emit_mov(&esp_opnd,&ebp_opnd);
        emitter.emit_pop(&ebp_opnd);
    } else if (frame_size > 0) {
        emitter.emit_alu(add_opc,&esp_opnd,&Imm_Opnd(frame_size));
    }
#ifdef _TRACE
    {
        //
        // insert a call to a C function (jim_fun_exit) that will print out
        // this method's exit trace
        //
        emitter.emit_push(&M_Base_Opnd(esp_reg,8));
        emitter.emit_push(&Imm_Opnd((unsigned)method_info->name));
        unsigned patch_offset = emitter.get_offset()+1;
        emitter.emit_call((char*)jim_fun_exit);
        emitter.emit_alu(add_opc,&esp_opnd,&Imm_Opnd(8));
        code_patch_list =
            new(mem_manager) Call_Patch(code_patch_list,patch_offset,(char*)jim_fun_exit);
    }
#endif // _TRACE
    
    
    //
    // return, removing the incoming arguments (4 bytes each) from the stack
    //
    emitter.emit_ret(&Imm_Opnd(frame.n_args<<2));
}

//
// Emit a 32-bit store of the operand 'src' into the memory location 'opnd'.
//
//   emitter  code emitter that receives the generated instructions
//   stack    supplies the register manager and the FP stack bookkeeping
//   opnd     destination memory operand
//   src      source operand (in/out): when it is a memory operand it is
//            first loaded into a register, and 'src' is redirected to
//            that register operand
//
// The operand that 'src' refers to on exit has been released back to the
// register manager.
//
void gen_store32(Code_Emitter& emitter,Stack& stack,M_Opnd *opnd,Operand*& src) {
    if (src->is_mem()) {
        //
        // A memory source has to pass through a register.  The high half
        // of a 64-bit value stays held while get_reg() runs; any other
        // memory source is released first.  The concern is get_reg()
        // killing a base or index address register that the source mem
        // operand still needs.
        //
        const bool is_hi_half = (src->ty == Operand::T64bit_hi);
        if (!is_hi_half)
            src->free_opnd(&stack.reg_manager);
        Reg_Operand *scratch = stack.reg_manager.get_reg();
        src->emit_mov_to_reg(emitter,&scratch->opnd);
        src = scratch;
    }
    src->emit_mov_to_mem(emitter,opnd);

    // Outside strict FP mode, storing an Fp operand has already popped the
    // top of the FP stack, so the tracked count is decremented to match.
    if (!stack.fp_strict_mode && src->kind == Operand::Fp)
        stack.fp_dec_cnt();

    src->free_opnd(&stack.reg_manager);
}

//
// Compute the number of bytes needed for the permanent copy of
// 'method_info'.  The total is the sum of:
//   - the Jit_Method_Info structure itself, which already holds one
//     Call_Site_Info entry
//   - one additional Call_Site_Info per call site beyond the first
//   - one Esp_Record per stack-pointer record over all call sites
//   - one byte per local variable and per stack slot of every call site
//     (the type vectors)
//   - whatever the register allocator reports via space_in_method_info()
//
static unsigned gen_method_info_size(Jit_Method_Info *method_info, Register_Allocator *regalloc)
{
    // Walk the call sites once, gathering the record count and the
    // combined length of all type vectors (seeded with the variable count).
    unsigned esp_record_count = 0;
    unsigned type_vector_bytes = method_info->num_vars;
    for (unsigned site = 0; site < method_info->num_call_sites; site++) {
        const Call_Site_Info *csi = &method_info->cs_info[site];
        esp_record_count += csi->num_records;
        type_vector_bytes += csi->stack_depth;
    }

    unsigned total = sizeof(Jit_Method_Info);
    if (method_info->num_call_sites > 1) {
        // the first call site entry is part of sizeof(Jit_Method_Info)
        total += (method_info->num_call_sites - 1) * sizeof(Call_Site_Info);
    }
    total += esp_record_count * sizeof(struct Esp_Record);
    total += type_vector_bytes;
    total += regalloc->space_in_method_info(method_info);

    return total;
}

//
// Build the permanent, compacted copy of a method's Jit_Method_Info inside
// a block obtained from method_allocate_jit_data_block().
//
//   old_mi              method info filled in during code generation
//   method_handle       passed through to method_allocate_jit_data_block()
//   compilation_handle  passed through to method_allocate_jit_data_block()
//   size                number of bytes to allocate for the new block
//                       (presumably the value of gen_method_info_size() --
//                       the assert below compares against that function,
//                       not against 'size')
//   code_block          address of the generated code; added to every
//                       code offset so the stored values become addresses
//   regalloc            register allocator that appends its own data
//   cs_info             array of pointers to the call site records to copy
//
// Layout written after the fixed Jit_Method_Info part, in this order:
// extra Call_Site_Info entries, the Esp_Record arrays of all call sites,
// the variable type vector, the per-call-site type vectors, and finally
// the space accounted for by regalloc->space_in_method_info().
//
// Returns the new method info.
//
static Jit_Method_Info* gen_perm_method_info(Jit_Method_Info *old_mi,
                                             Method_Handle method_handle,
                                             Compile_Handle compilation_handle,
                                             unsigned size,
                                             char *code_block,
                                             Register_Allocator *regalloc,
                                             Call_Site_Info *cs_info[])
{
    Jit_Method_Info *new_mi = 
        (Jit_Method_Info *)method_allocate_jit_data_block(method_handle,
        compilation_handle, size);

    // copy common parts of the two method_infos
    unsigned common_size = sizeof(Jit_Method_Info);
    memcpy(new_mi,old_mi,common_size);
    Call_Site_Info *new_cs_info = new_mi->cs_info;
    // With more than one call site, every entry is copied from the
    // cs_info[] pointers.  With exactly one, the entry that memcpy brought
    // over from old_mi is kept as is and cs_info[] is not consulted here.
    if (old_mi->num_call_sites > 1) {
        for (unsigned i = 0; i < old_mi->num_call_sites; i++)
            new_cs_info[i] = *cs_info[i];  // memory copy
        // one entry is already counted in sizeof(Jit_Method_Info)
        common_size += (old_mi->num_call_sites - 1) * sizeof(Call_Site_Info);
    }
    
    // _ptr points to free space following common part of old and new method_infos
    char *_ptr = (((char *)new_mi) + common_size);
    
    // compact Stack_Pointer_Record list into Esp_Record array

    unsigned i;
    for (i = 0; i < new_mi->num_call_sites; i++) {
        Call_Site_Info *csi = &new_cs_info[i];
        // turn the recorded code offsets into addresses inside code_block
        csi->ret_IP += (unsigned)code_block;
        csi->call_IP += (unsigned)code_block;
        csi->precall_IP += (unsigned)code_block;
        // get beginning of list
        // (up to this point esp_record holds the head of a
        // Stack_Pointer_Record linked list, stored through a cast)
        struct Stack_Pointer_Record *srp = (Stack_Pointer_Record *)csi->esp_record;
        // overwrite beginning of list with new array ptr
        csi->esp_record = (struct Esp_Record *)_ptr;
        _ptr += csi->num_records * sizeof(Esp_Record);
        // for each record, copy contents into compacted array
        for (unsigned j = 0; j < csi->num_records; j++) {
            csi->esp_record[j].offset = srp->offset + (unsigned)code_block;
            csi->esp_record[j].args_on_stack = srp->args_on_stack;
            srp = srp->next;
        }
        // the list must contain exactly num_records entries
        assert(srp == NULL);
        
    }
    
    // copy type vectors
    // (one byte per entry: first the variables, then each call site's stack)
    new_mi->var_type_vector = _ptr;
    for (i = 0; i < new_mi->num_vars; i++)
        *_ptr++ = old_mi->var_type_vector[i];
    for (i = 0; i < new_mi->num_call_sites; i++) {
        Call_Site_Info *csi = &new_cs_info[i];
        csi->type_vector = _ptr;
        csi->type_vector_length = csi->stack_depth;
        // the source bytes come from the caller's cs_info[] records
        for (unsigned j = 0; j < csi->type_vector_length; j++)
            *_ptr++ = cs_info[i]->type_vector[j];
    }
    
    // Hand the new block to the register allocator, then step over the
    // space it accounts for.  NOTE(review): presumably the two set_* calls
    // fill that space -- confirm in Register_Allocator.
    regalloc->set_call_site_info(new_mi, size);
    regalloc->set_register_saved_info(new_mi);
    _ptr += regalloc->space_in_method_info(new_mi);
    
    // everything written above must add up to the computed size
    assert (_ptr == (((char *)new_mi) + gen_method_info_size(old_mi,regalloc)));
    
#if 0  // Don't delete -- may be useful someday.  (JMS)
    // method_info
    cout << endl << "method_info:" << endl;
    cout << "name                   = " << (char *)new_mi->name << endl;
    
    cout << "cnt                    = " << (void *)new_mi->cnt << endl;
    cout << "num_spills             = " << (void *)new_mi->num_spills << endl;
    cout << "num_in_args            = " << (void *)new_mi->num_in_args << endl;
    cout << "num_vars               = " << (void *)new_mi->num_vars << endl;
    cout << "num_callee_saved_regs  = " << (void *)new_mi->num_callee_saved_regs << endl;
    cout << "num_call_sites         = " << (void *)new_mi->num_call_sites << endl << endl;
    cout << "var_type_vector        = " << (void *)new_mi->var_type_vector << endl;
    cout << "                       = ";
    for (i = 0; i < new_mi->num_vars; i++)
        cout << *(new_mi->var_type_vector + i);
    cout << endl;
    // call sites
    for (unsigned j = 0; j < new_mi->num_call_sites; j++) {
        Call_Site_Info *csi = &new_mi->cs_info[j];
        cout << "call site " << j << " info:" << endl;
        cout << "  stack_depth        = " << (void *)csi->stack_depth << endl;
        cout << "  num_out_args       = " << (void *)csi->num_out_args << endl;
        cout << "  ret_IP             = " << (void *)csi->ret_IP << endl;
        cout << "  call_IP            = " << (void *)csi->call_IP << endl;
        cout << "  num_records        = " << (void *)csi->num_records << endl;
        cout << "  esp_record         = " << (void *)csi->esp_record << endl;
        // esp record
        for (unsigned j = 0; j < csi->num_records; j++) {
            cout << "      record #" << j << ": offset = " << (void*)csi->esp_record[j].offset;
            cout << "; args_on_stack = " << (void*)csi->esp_record[j].args_on_stack << endl;
        }
        // type vector
        cout << "  type_vector_length = " << (void *)csi->type_vector_length << endl;
        cout << "  type_vector        = " << (void *)csi->type_vector << endl;
        cout << "                     = ";
        for (i = 0; i < csi->type_vector_length; i++)
            cout << *(csi->type_vector + i);
        cout << endl;
    }
    cout << endl;
#endif	// 0
    
    return new_mi;
}


//
// Record a stack-pointer change for the call site currently being built
// (the entry at index method_info->cnt).
//
//   code_offset        code offset the record applies to
//   num_args_on_stack  value stored in the record's args_on_stack field
//   method_info        method info whose current call site gets the record
//   mem_manager        allocator that provides the record's storage
//
// The new record is pushed on the front of a Stack_Pointer_Record list
// whose head is kept, through a cast, in the call site's esp_record field;
// the call site's num_records is incremented.
//
void make_esp_record (unsigned code_offset,
                      unsigned num_args_on_stack,
                      Jit_Method_Info *method_info,
                      Mem_Manager &mem_manager)
{
    void *storage = mem_manager.alloc(sizeof(struct Stack_Pointer_Record));
    struct Stack_Pointer_Record *node = (struct Stack_Pointer_Record *)storage;

    // index of the call site under construction
    const unsigned slot = method_info->cnt;

    node->offset = code_offset;
    node->args_on_stack = num_args_on_stack;

    // link in front of the existing list and make the node the new head
    node->next = (struct Stack_Pointer_Record *)method_info->cs_info[slot].esp_record;
    method_info->cs_info[slot].esp_record = (struct Esp_Record *)node;
    ++method_info->cs_info[slot].num_records;
}

//
// Write an annotated listing of one compiled method to the dump_jit file:
// a "Method ..." header, the x86 code emitted ahead of the first bytecode,
// then every bytecode followed by the x86 instructions generated for it,
// and an "End of Method ..." trailer.
//
//   method_handle    method being dumped; supplies class, name, descriptor
//   bytecode_start   first bytecode of the method
//   bytecode_length  number of bytecode bytes
//   map              map[i].offset is the x86 code offset recorded for the
//                    bytecode at index i
//   prepass          supplies the per-bytecode block / handler / depth attributes
//   method_name      not used by this function
//   x86_code         start of the generated code
//   x86_code_length  size of the generated code in bytes
//
// When SHORT_DUMP_JIT is defined only the header and the code address
// range are written.
//
static void L1a_dump_jit(Method_Handle method_handle,
                         const unsigned char *bytecode_start,
                         unsigned bytecode_length,
                         Map_Entry map[],
                         CG_Prepass& prepass,
                         const char *method_name,
                         char *x86_code,
                         unsigned x86_code_length)
{
    char buf[80000];    // receives one line of disassembly text at a time
    
    Class_Handle ch = method_get_class(method_handle);
    FILE *_f = acquire_dump_jit_file(ch);
    
    fprintf(_f, "Method %s.%s%s\n",
        class_get_name(ch), method_get_name(method_handle), method_get_descriptor(method_handle));
    
#ifdef TRACE_LEVEL1A
    cout << "Dump_jit for " << class_get_name(ch) << "."
        << method_get_name(method_handle) << "." << method_get_descriptor(method_handle) << endl;
#endif
    
#ifdef SHORT_DUMP_JIT
    // %X expects an unsigned int, so the code addresses are converted
    // explicitly instead of being passed as raw pointers.
    fprintf(_f, "Generated code: %X .. %X\n\n",
        (unsigned)x86_code,
        (unsigned)(x86_code + x86_code_length));
    release_dump_jit_file();
    return;
#endif
    
    //
    // Code emitted ahead of the first bytecode.  Nothing is printed when
    // map[0].offset is 0; otherwise the first instruction would show up
    // here and again under the first bytecode below.
    //
    char *x86_ip = x86_code;
    char *first_bc_code = x86_code + map[0].offset;
    while (x86_ip < first_bc_code) {
        x86_ip = x86_disasm(x86_ip, buf, true, true);
        fprintf(_f, "\t%s\n", buf);
    }
    
    char *dis_bc(const Byte *bc_start, unsigned *bc_ip, char *buf, Method_Handle m);
    
    unsigned bc_ip = 0;
    int bb_num = 0;
    while(bc_ip < bytecode_length) {
        char *x86_start = x86_code + map[bc_ip].offset;
        
        //
        // print basic block
        //
        Bytecode_Info *bi = prepass.bytecode_info(bc_ip);
        if (bi->attr.is_block_entry) {
            fprintf(_f, "// BB#%d:", bb_num++);
            if (bi->attr.is_exception_handler_entry) {
                fprintf(_f, " (* exception handler *)\n");
            } else {
                if(bi->attr.depth)
                    fprintf(_f, " (non-zero stack depth: %d)", bi->attr.depth);
                fprintf(_f, "\n");
            }
        }
        
        // dis_bc() formats the bytecode at bc_ip into buf and moves bc_ip
        // on to the next bytecode (the loop relies on that advance).
        dis_bc(bytecode_start, &bc_ip, buf, method_handle);
        fprintf(_f, "%s\n", buf);
        
        //
        // Decode a sequence of X86 code: everything up to the code of the
        // next bytecode, or up to the end of the method for the last one
        //
        char *x86_end   = x86_code +
            ( (bc_ip < bytecode_length) ?
            map[bc_ip].offset : x86_code_length );
        
        while(x86_start < x86_end) {
            x86_start = x86_disasm(x86_start, buf, true, true);
            fprintf(_f, "\t%s\n", buf);
        }
        
    }
    
    
    fprintf(_f, "End of Method %s.%s%s\n\n",
        class_get_name(ch), method_get_name(method_handle), method_get_descriptor(method_handle));
    
    release_dump_jit_file();
} //L1a_dump_jit

//###
//
// Write the x86 listing of one compiled method to emitter_offset_fp:
// a "Method ..." header, a "// BB#0:" line for the code emitted ahead of
// the first bytecode, then the x86 instructions of every bytecode, with a
// "// BB#n:" line (n starting at 1) at each block entry.  Every
// disassembly line is cut to at most 54 characters.  The bytecode text
// and the end-of-method trailer are not written.
//
//   method_handle    method being listed; supplies class, name, descriptor
//   bytecode_start   first bytecode of the method
//   bytecode_length  number of bytecode bytes
//   map              map[i].offset is the x86 code offset recorded for the
//                    bytecode at index i
//   prepass          supplies the per-bytecode block / handler / depth attributes
//   method_name      not used by this function
//   x86_code         start of the generated code
//   x86_code_length  size of the generated code in bytes
//
// This function does not acquire the dump_jit file, so it does not
// release it either.
//
static void emitter_native_code(Method_Handle method_handle,
                         const unsigned char *bytecode_start,
                         unsigned bytecode_length,
                         Map_Entry map[],
                         CG_Prepass& prepass,
                         const char *method_name,
                         char *x86_code,
                         unsigned x86_code_length)
{
    char buf[80000];    // receives one line of disassembly text at a time
    
    Class_Handle ch = method_get_class(method_handle);
    FILE *_f = emitter_offset_fp;
    
    fprintf(_f, "Method %s.%s%s\n",
        class_get_name(ch), method_get_name(method_handle), method_get_descriptor(method_handle));
    fprintf(_f, "// BB#0:\n");

#ifdef TRACE_LEVEL1A
    cout << "Dump_jit for " << class_get_name(ch) << "."
        << method_get_name(method_handle) << "." << method_get_descriptor(method_handle) << endl;
#endif
    
#ifdef SHORT_DUMP_JIT
    // %X expects an unsigned int, so the code addresses are converted
    // explicitly instead of being passed as raw pointers.  There is no
    // release_dump_jit_file() call here because the output goes to
    // emitter_offset_fp and the dump_jit file was never acquired.
    fprintf(_f, "Generated code: %X .. %X\n\n",
        (unsigned)x86_code,
        (unsigned)(x86_code + x86_code_length));
    return;
#endif
    
    //
    // Code emitted ahead of the first bytecode.  Nothing is printed when
    // map[0].offset is 0; otherwise the first instruction would show up
    // here and again under the first bytecode below.
    //
    char *x86_ip = x86_code;
    char *first_bc_code = x86_code + map[0].offset;
    while (x86_ip < first_bc_code) {
        x86_ip = x86_disasm(x86_ip, buf, true, true);
        buf[54] = 0;    // keep only the first 54 characters of the line
        fprintf(_f, "\t%s\n", buf);
    }
    
    char *dis_bc(const Byte *bc_start, unsigned *bc_ip, char *buf, Method_Handle m);
    
    unsigned bc_ip = 0;
    int bb_num = 1;     // BB#0 was used for the code ahead of the first bytecode
    while(bc_ip < bytecode_length) {
        char *x86_start = x86_code + map[bc_ip].offset;
        
        //
        // print basic block
        //
        Bytecode_Info *bi = prepass.bytecode_info(bc_ip);
        if (bi->attr.is_block_entry) {
            fprintf(_f, "// BB#%d:", bb_num++);
            if (bi->attr.is_exception_handler_entry) {
                fprintf(_f, " (* exception handler *)\n");
            } else {
                if(bi->attr.depth)
                    fprintf(_f, " (non-zero stack depth: %d)", bi->attr.depth);
                fprintf(_f, "\n");
            }
        }
        
        // Called only to move bc_ip on to the next bytecode; the text it
        // leaves in buf is not written out.
        dis_bc(bytecode_start, &bc_ip, buf, method_handle);
        
        //
        // Decode a sequence of X86 code: everything up to the code of the
        // next bytecode, or up to the end of the method for the last one
        //
        char *x86_end   = x86_code +
            ( (bc_ip < bytecode_length) ?
            map[bc_ip].offset : x86_code_length );
        
        while(x86_start < x86_end) {
            x86_start = x86_disasm(x86_start, buf, true, true);
            buf[54] = 0;    // keep only the first 54 characters of the line
            fprintf(_f, "\t%s\n", buf);
        }
        
    }
   
} 
//###

JIT_Result	select_code(
                        Mem_Manager&	mem_manager,
                        Code_Emitter&	emitter,
                        CG_Prepass&		prepass,
                        Frame&          frame,
                        Stack&          stack,
                        Pre_Alloc_Operand_Pool&	operand_pool,
                        Compile_Handle	compilation_handle,
                        Class_Handle	class_handle,
                        Method_Handle	method_handle,
                        const BYTE		*first_bc,		// first bytecode
                        size_t			code_len,
                        JIT_Flags		flags,
                        Jit_Method_Info	*method_info,
                        Register_Allocator *regalloc,
                        Profile_Rec     *prof_rec,
                        Dbg             *dbg_support,
                        CODE_MI         code_mi
#ifdef VTune_Support
                        , iJIT_Method_Load *mInfo
#endif // VTune_Support
                        ) {
    int VMresFailed = 0;
    
    unsigned index; // index of constant pools or variables
    Operand *src, *src_lo, *src_hi; // source operands
    //
    // code block that has been previously generated (Lazy method info)
    //
    Small_Method_Info *smi = NULL;
    char *m_code = NULL;   // original code block
    unsigned *class_initializer = NULL;
    unsigned num_get_put_static = 0;
    if (code_mi == cm_gen_method) {
        smi = (Small_Method_Info *)method_get_info_block(method_handle,o1_jit);
        m_code = (char*)method_get_code_block_addr(method_handle,o1_jit);
        class_initializer = smi->class_initializer;
    } else {
        unsigned bv_wd = prepass.num_get_put_static/(sizeof(unsigned)*8)+1;
        unsigned sz = sizeof(unsigned)*bv_wd;
        class_initializer = (unsigned*)mem_manager.alloc(sz);
        for (unsigned i = 0; i < bv_wd; i++)
            class_initializer[i] = 0;
    }
    //
    // immediate operand
    //
    Imm_Opnd imm(0);
    //
    // map data structure
    //
    Map_Entry *map = (Map_Entry *)mem_manager.alloc(sizeof(Map_Entry)*code_len);
    memset((char*)map,'\0',sizeof(Map_Entry)*code_len);
    //
    // Patches for profiling code
    //
    Profile_Patch prof_patch(mem_manager, statistics ? prepass.num_blocks + 1 : prepass.num_entries_back_edge+1 );
    //
    // list of code and data patches for this method
    //
    Code_Patch *code_patch_list = NULL;
    //
    // list of switch table entry patches
    //
    Table_Entry_Patch *table_entry_patch_list = NULL;
    //
    // emitters for the read-only and read-write data blocks
    // the read-only data block is for float-point constants and switch tables
    // the read-write data block is for invokeinterface hints
    //
    Data_Emitter ro_data_emitter(mem_manager,prepass.ro_data_size());
    Data_Emitter rw_data_emitter(mem_manager,prepass.rw_data_size());
    
    const char *method_name = method_get_name(method_handle);
    const char *class_name = class_get_name(class_handle);
    DWORD method_flags = method_get_flags(method_handle);
    
#ifdef _TRACE
    //
    // insert a call to a C function that will print out this method
    // entry trace print out
    //
    emitter.emit_push(&M_Base_Opnd(esp_reg,8));
    emitter.emit_push(&Imm_Opnd((unsigned)method_name));
    emitter.emit_push(&Imm_Opnd((unsigned)class_name));
    unsigned patch_offset = emitter.get_offset()+1;
    emitter.emit_call((char*)jim_fun_entry);
    emitter.emit_alu(add_opc,&esp_opnd,&Imm_Opnd(12));
    code_patch_list =
        new(mem_manager) Call_Patch(code_patch_list,patch_offset,(char*)jim_fun_entry);
#endif // _TRACE
    
    //
    // emit prolog
    //
    unsigned callee_saved_regs_mask = 0;
    int ebp_based = !method_info->is_esp_based;
    callee_saved_regs_mask = regalloc->callee_saved_regs(ebp_based);
    
    emit_prolog(emitter,frame,callee_saved_regs_mask,method_handle,mem_manager,
        prepass,code_patch_list,method_info,regalloc,
        method_name,stack,prof_rec,prof_patch);
    
#ifndef NO_BOUNDS_CHECKING
    Bounds_Checking bounds;
#endif // NO_BOUNDS_CHECKING
    
    stack.mark_first_bc(first_bc);
    const unsigned char *last_bc = first_bc + code_len; // last bytecode
    const unsigned char *bc = first_bc;
    const unsigned char *prev_bc;
    const unsigned char *curr_bc = first_bc;	// ptr to current bytecode
    int   bb_idx_hint = 0;
    unsigned be_entry = 0;
	//
	// The array offset_buf is used to record the instrument-offset and the counter-address 
	//  for statistics purpose. The size of the array should be large enough
	//
	unsigned offset_buf[10000] ;
	unsigned offset_buf_offset = 0 ;
	//
	// For convience, we record the offset_buf and offset_buf_offset in emitter
	// also prof_rec
	//
	emitter.offset_buf = offset_buf ;
	emitter.offset_buf_offset = 0 ;
	emitter.prof_rec = prof_rec ;
	emitter.inner_bb_cnt_offset = 0 ;
	b_inner_counter = false ;

	Bit_Vector *bv_visited = method_info->is_visited;
    while (bc < last_bc) {
        prev_bc = curr_bc;
        curr_bc = bc;
        unsigned bc_index = curr_bc-first_bc; // offset of current bytecode
        unsigned char bytecode = *bc++;	// value of current bytecode
		if(!bv_visited->is_set(bc_index))
			continue;

		//
        // if dbg_support is not NULL, we want to know the code offset
        // of a bytecode location.  As soon as we figure out the offset,
        // we immediately return.
        //
        if (dbg_support != NULL)
        {
            if (bc_index == dbg_support->get_bc_location())
            {
                dbg_support->set_code_offset(emitter.get_offset());
                return JIT_FAILURE;
            }
            //
            // the bc_location is not a legeal bytecode offset
            //
            else if (bc_index > dbg_support->get_bc_location())
                return JIT_FAILURE;
        }

        Branch_Patch *patch_list = NULL;
        Bytecode_Info *bytecode_info = prepass.bytecode_info(bc_index);
        //
        // init mimic stack at the entry of block
        //
        if (bytecode_info->attr.is_block_entry) {
            stack.home_all(); // make sure all operands are spilled at the end of bb
            stack.reset();
#ifdef LOCAL_CALLEE
            unsigned char leftover_regs = regalloc->registers_available_in_codegen_bb(curr_bc, bb_idx_hint);			
            stack.reg_manager.add_free_callee_regs(leftover_regs);
#ifdef _CSE 
            stack.cse->inc_cse_reg_upto(leftover_regs);
#endif // _CSE
#endif // LOCAL_CALLEE
            stack.mark_curr_bc(NULL);
            for (unsigned i=0; i < bytecode_info->attr.depth; i++)
                stack.push(operand_pool.nth_stack(i));
#ifndef NO_BOUNDS_CHECKING
            bounds.reset();
#endif // NO_BOUNDS_CHECKING
        }
        stack.mark_curr_bc(curr_bc);
        //
        // check the patch_list
        //
        patch_list = map[bc_index].patch;
        map[bc_index].offset = emitter.get_offset();
        
        //
		// emit instrumenting code for every BB, if statistics.
		// Otherwise, emit instrumenting code for back_edge
        //
        if (instrumenting){
			if(statistics && bytecode_info->attr.is_block_entry){// use the old variables, although the var-names are some strange...
				unsigned n_bb = prepass.num_blocks ;
				assert(be_entry < n_bb) ;
				inserting_instrumenting_code(emitter,prof_patch/*for statistics, this var never used*/,
					bc_index,(unsigned*)&prof_rec->back_edge[be_entry]);
				((unsigned short*)&prof_rec->back_edge[n_bb])[be_entry] = bc_index;
				be_entry++;
			}else if(bytecode_info->attr.is_block_entry &&
				bytecode_info->attr.is_back_edge_entry) {
				unsigned n_be = prepass.num_entries_back_edge;
				assert(be_entry < n_be);
				inserting_instrumenting_code(emitter,prof_patch,
					bc_index,(unsigned*)&prof_rec->back_edge[be_entry]);
				((unsigned short*)&prof_rec->back_edge[n_be])[be_entry] = bc_index;
				be_entry++;
			}
        }
        
        if (bytecode_info->attr.is_exception_handler_entry) {
            //
            // On the entry of exception handler, the stack contains only
            // one exception object (stack depth=1).  We need to emit code
            // to pop out this and store in nth_stack(0)
            //
#if 1 // JMS
            // At an exception handler, add one call_site record for GC.
            method_info->cs_info[method_info->cnt].call_IP = (unsigned)emitter.get_offset();
            method_info->cs_info[method_info->cnt].precall_IP = (unsigned)emitter.get_offset();
            method_info->cs_info[method_info->cnt].returns_ref = 0;
            method_info->cs_info[method_info->cnt].outarg_bv = 0;
            method_info->cs_info[method_info->cnt].m_handle = NULL;
            method_info->cs_info[method_info->cnt].ret_IP = (unsigned)emitter.get_offset();
            method_info->cnt ++;
#endif // 1
            unsigned offset = (stack.size - stack.depth() + frame.n_callee) << 2;
            emitter.emit_mov(&M_Base_Opnd(esp_reg,offset),&eax_opnd);
        }
#ifdef _CSE
        stack.cse->reset_popped_srcs();
        X86_Reg_No cse_reg = stack.cse->find_cse(first_bc,last_bc,curr_bc);
        if ( cse_reg != n_reg && 
            (stack.reg_manager.is_free(cse_reg) ||
            stack.reg_manager.has_available_reg())) {
            int cse_len = stack.cse->reg_exp(cse_reg)->len();
            //
            // for dump_jit.  
            // If we don't map cse to emitter.get_offset(), the output of dump_jit
            // is wrong.
            //
            stack.cse->map_offset(map,emitter.get_offset(),first_bc,curr_bc,cse_len);
            bc = curr_bc + cse_len;
            // if cse_reg is free, then go ahead and use it
            if (stack.reg_manager.is_free(cse_reg))
                stack.push_cse(stack.reg_manager.hold_cse_reg(cse_reg),cse_len);
            else { 
                // if cse_reg is not free, we try to find other free reg so that we
                // can move cse_reg to reg. We give up if there is no other free reg
                // because it is hard to justify that spilling is better than 
                // recomputing the cse expression.
                Reg_Operand *reg = stack.reg_manager.get_reg();
                Reg_Operand(cse_reg).emit_mov_to_reg(emitter,&reg->opnd);
                stack.push_cse(reg,cse_len);
            }
            continue;
        }
#endif // _CSE
        unsigned depth = stack.depth(); // the current stack depth

		//
		//For statistics, for some inner bb bytecodes, we need one more counter
		//
		if(inner_statistics && b_inner_counter && !bytecode_info->attr.is_block_entry){
			assert(emitter.prof_rec) ;
			inner_bb_instrumenting_code(emitter,
				(unsigned*)&((PROF_COUNTER*)&((unsigned short*)&emitter.prof_rec->back_edge[emitter.prof_rec->n_back_edge])[emitter.prof_rec->n_back_edge])[emitter.inner_bb_cnt_offset++]);
		}
		b_inner_counter = false ;
        switch (bytecode) {
        case 0x00:	break;		// nop
            //
            // constant loads
            //
        case 0x01:			// aconst_null
            stack.push(operand_pool.imm(0));
            break;
        case 0x02: case 0x03: case 0x04: case 0x05:
        case 0x06: case 0x07: case 0x08:
            // iconst -1,0,...,5
            stack.push(operand_pool.imm(bytecode-0x03));
            break;
        case 0x09: case 0x0a:
            // lconst 0,1
            stack.push64(operand_pool.imm(bytecode-0x09),
                operand_pool.imm(0));
            break;
        case 0x0b:	// fconst 0.0F
            stack.push(new(mem_manager) Static_Operand(&FCONST0,1));
            break;
        case 0x0c:	// fconst 1.0F
            stack.push(new(mem_manager) Static_Operand(&FCONST1,1));
            break;
        case 0x0d:	// fconst 2.0F
            stack.push(new(mem_manager) Static_Operand(&FCONST2,1));
            break;
        case 0x0e:	// dconst 0.0
            {
                char *p = (char*)&DCONST0;
                stack.push64(new(mem_manager) Static_Operand(p,1),
                             new(mem_manager) Static_Operand(p+4,1));
            }
            break;
        case 0x0f:	// dconst 1.0
            {
                char *p = (char*)&DCONST1;
                stack.push64(new(mem_manager) Static_Operand(p,1),
                             new(mem_manager) Static_Operand(p+4,1));
            }
            break;
            //
            // stack pushes
            //
        case 0x10: 		// bipush
            stack.push(new(mem_manager) Imm_Operand(*(char*)bc++));
            break;
        case 0x11:		// sipush
            stack.push(new(mem_manager) Imm_Operand((*(char*)bc << 8) + bc[1]));
            bc += 2;
            break;
        case 0x12:						// ldc
            //
            // load constant from constant pool
            //
            index = *bc++;
            if (method_info->cnt < method_info->num_call_sites)
                method_info->cs_info[method_info->cnt].call_IP = (unsigned)emitter.get_offset();
            gen_ldc(ro_data_emitter,emitter,stack,mem_manager,code_patch_list,class_handle,index,method_info);
            break;
        case 0x13:			// ldc_w
            index = (*bc << 8) + bc[1];
            bc += 2;
            if (method_info->cnt < method_info->num_call_sites)
                method_info->cs_info[method_info->cnt].call_IP = (unsigned)emitter.get_offset();
            gen_ldc(ro_data_emitter,emitter,stack,mem_manager,code_patch_list,class_handle,index,method_info);
            break;
        case 0x14:			// ldc2_w
            index = (*bc << 8) + bc[1];
            bc += 2;
            gen_ldc(ro_data_emitter,emitter,stack,mem_manager,code_patch_list,class_handle,index,method_info);
            break;
        case 0x15:	// iload
            index = *bc++;
            gen_int_load(mem_manager,emitter,stack,method_info,
                regalloc,
                frame,index,0);
            break;
        case 0x17:	// fload
            index = *bc++;
            stack.push(new(mem_manager) Mem_Var_Operand(frame,index,0));
            break;
        case 0x19:	// aload
            index = *bc++;
            gen_int_load(mem_manager,emitter,stack,method_info,
                regalloc,
                frame,index,1);
            break;
        case 0x16:	// lload
            {
                index = *bc++;
                X86_Reg_No reg_lo, reg_hi;
                if ((reg_lo = regalloc->reg_var_allocated_to(index+1, NULL/*XXX-hack*/)) != n_reg &&
                    (reg_hi = regalloc->reg_var_allocated_to(index, NULL/*XXX-hack*/)) != n_reg)
                {
                    stack.push64(stack.reg_manager.get_reg(reg_lo),
                                 stack.reg_manager.get_reg(reg_hi));
                    break;
                }
                reg_lo = stack.reg_manager.find_reg_value(index+1);
                reg_hi = stack.reg_manager.find_reg_value(index);
                if (reg_lo != n_reg && reg_hi != n_reg) // for LOAD_STORE
                    stack.push64(stack.reg_manager.get_reg(reg_lo),
                                 stack.reg_manager.get_reg(reg_hi));
                else
                    stack.push64(new(mem_manager) Mem_Var_Operand(frame,index+1,0),
                                 new(mem_manager) Mem_Var_Operand(frame,index,0));
            }
            break;
        case 0x18:	// dload
            index = *bc++;
            load_double(emitter,frame,stack,mem_manager,index);
            break;
        case 0x1a: case 0x1b: case 0x1c: case 0x1d:	// iload_{0,1,2,3}
            index = bytecode-0x1a;
            gen_int_load(mem_manager,emitter,stack,method_info,
                regalloc,
                frame,index,0);
            break;
        case 0x22: case 0x23: case 0x24: case 0x25:	// fload_{0,1,2,3}
            index = bytecode-0x22;
            stack.push(new(mem_manager) Mem_Var_Operand(frame,index,0));
            break;
        case 0x2a: case 0x2b: case 0x2c: case 0x2d:	// aload_{0,1,2,3}
            index = (bytecode-0x2a);
            gen_int_load(mem_manager,emitter,stack,method_info,
                regalloc,
                frame,index,1);
            break;
        case 0x1e: case 0x1f: case 0x20: case 0x21:	// lload_{0,1,2,3}
            {
                index = (bytecode-0x1e)&0x03;
                X86_Reg_No reg_lo, reg_hi;
                if ((reg_lo = regalloc->reg_var_allocated_to(index+1, NULL/*XXX-hack*/)) != n_reg &&
                    (reg_hi = regalloc->reg_var_allocated_to(index, NULL/*XXX-hack*/)) != n_reg)
                {
                    stack.push64(stack.reg_manager.get_reg(reg_lo),
                                 stack.reg_manager.get_reg(reg_hi));
                    break;
                }
                reg_lo = stack.reg_manager.find_reg_value(index+1);
                reg_hi = stack.reg_manager.find_reg_value(index);
                if (reg_lo != n_reg && reg_hi != n_reg) // for LOAD_STORE
                    stack.push64(stack.reg_manager.get_reg(reg_lo),
                                 stack.reg_manager.get_reg(reg_hi));
                else
                    stack.push64(new(mem_manager) Mem_Var_Operand(frame,index+1,0),
                                 new(mem_manager) Mem_Var_Operand(frame,index,0));
            }
            break;
        case 0x26: case 0x27: case 0x28: case 0x29:	// dload_{0,1,2,3}
            index = (bytecode-0x1e)&0x03;
            load_double(emitter,frame,stack,mem_manager,index);
            break;
            //
            // array load
            //
        //
        // All eight array-load opcodes share one emitter.  The element type is
        // passed as a character taken from "IJFDLBCS" indexed by the opcode's
        // distance from iaload (0x2e): iaload->'I', laload->'J', faload->'F',
        // daload->'D', aaload->'L', baload->'B', caload->'C', saload->'S'.
        //
        case 0x2e:	// iaload
        case 0x2f:	// laload
        case 0x30:	// faload
        case 0x31:	// daload
        case 0x32:	// aaload
        case 0x33:	// baload
        case 0x34:	// caload
        case 0x35:	// saload
            // record the code offset at which this access starts in the
            // current cs_info entry before any code for it is emitted
            method_info->cs_info[method_info->cnt].call_IP = (unsigned)emitter.get_offset();
            emit_array_load(emitter,stack,mem_manager,"IJFDLBCS"[bytecode - 0x2e],
                code_patch_list,method_handle,bc_index,method_info
                , regalloc
#ifndef NO_BOUNDS_CHECKING
                ,bounds
#endif // NO_BOUNDS_CHECKING
                );
            break;
        //
        // One-word stores with an explicit local index: the index is the single
        // byte that follows the opcode.
        //
        case 0x3a:	// astore
            index = *bc++;
            // bounds is passed along only when bounds checking is compiled in
            gen_astore(emitter,stack,prepass,method_info,
                regalloc,
                frame,index,curr_bc
#ifndef NO_BOUNDS_CHECKING
                ,bounds
#endif // NO_BOUNDS_CHECKING
                );
            break;
        case 0x36:	// istore
            index = *bc++;
            gen_int_store(emitter,stack,prepass,method_info,
                regalloc,
                frame,index);
            break;
        case 0x38:	// fstore
            index = *bc++;
            // emit a GC-tag clear for the slot when IS_INVALID_RV_BITMAP holds
            // for this method's ref_var_bitmap
            if(IS_INVALID_RV_BITMAP(method_info->ref_var_bitmap))
                GC_Tags_emit_clear(frame,emitter,prepass,index);
            src = stack.pop();
            // NOTE(review): presumably makes operands that still lazily refer
            // to this local concrete before the slot is overwritten - confirm
            // against Stack::no_laziness
            stack.no_laziness(frame,index,n_reg);
            // floats are always stored to the frame slot (no register path here)
            gen_store32(emitter,stack,&M_Var_Opnd(frame,index),src);
            break;
        //
        // Store a long into the local pair (index, index+1); the index is the
        // byte following the opcode.  The high word goes to slot index and the
        // low word to slot index+1.
        //
        case 0x37:	// lstore
            index = *bc++;
            // emit GC-tag clears for both slots when IS_INVALID_RV_BITMAP holds
            if(IS_INVALID_RV_BITMAP(method_info->ref_var_bitmap)) {
                GC_Tags_emit_clear(frame,emitter,prepass,index);
                GC_Tags_emit_clear(frame,emitter,prepass,index+1);
            }
            // NOTE: declared directly in the switch body (no enclosing braces),
            // so these names stay visible to the case labels that follow; the
            // lstore_{0,1,2,3} case uses them without declaring its own.
            X86_Reg_No reg_lo, reg_hi;
            if ((reg_lo = regalloc->reg_var_allocated_to(index+1, NULL/*XXX-hack*/)) != n_reg &&
                (reg_hi = regalloc->reg_var_allocated_to(index, NULL/*XXX-hack*/)) != n_reg)
            {
                // both halves are register-allocated: store register to register
                stack.pop64(src_lo, src_hi);
                stack.no_laziness(frame,index,reg_lo,reg_hi);
                gen_reg_store32(emitter,stack,reg_hi,src_hi);
                gen_reg_store32(emitter,stack,reg_lo,src_lo);
                break;
            }
            // otherwise store both halves to the frame slots
            stack.pop64(src_lo,src_hi);
            stack.no_laziness(frame,index,n_reg,n_reg);
            gen_store32(emitter,stack,&M_Var_Opnd(frame,index),src_hi);
            gen_store32(emitter,stack,&M_Var_Opnd(frame,index+1),src_lo);
            // tell the register manager which local each source operand now
            // corresponds to
            stack.reg_manager.reg_value(src_hi,index); // for LOAD_STORE
            stack.reg_manager.reg_value(src_lo,index+1); // for LOAD_STORE
            break;
            // JMS: lstore was originally the same as dstore.
        //
        // Store a double into the local pair (index, index+1); the index is the
        // byte following the opcode.  Unlike lstore there is no register path:
        // the value always goes through store_double.
        //
        case 0x39:	// dstore
            index = *bc++;
            // emit GC-tag clears for both slots when IS_INVALID_RV_BITMAP holds
            if(IS_INVALID_RV_BITMAP(method_info->ref_var_bitmap)) {
                GC_Tags_emit_clear(frame,emitter,prepass,index);
                GC_Tags_emit_clear(frame,emitter,prepass,index+1);
            }
            stack.pop64(src_lo,src_hi);
            stack.no_laziness(frame,index,n_reg,n_reg);
            store_double(emitter,frame,stack,index,src_lo,src_hi);
            break;
            
        //
        // One-word stores with the local index encoded in the opcode: the index
        // is the opcode's distance from the _0 form.  The bodies mirror the
        // explicit-index astore/istore/fstore cases.
        //
        case 0x4b: case 0x4c: case 0x4d: case 0x4e:	// astore_{0,1,2,3}
            index = (bytecode-0x4b);
            gen_astore(emitter,stack,prepass,method_info,
                regalloc,
                frame,index,curr_bc
#ifndef NO_BOUNDS_CHECKING
                ,bounds
#endif // NO_BOUNDS_CHECKING
                );
            break;
        case 0x3b: case 0x3c: case 0x3d: case 0x3e:	// istore_{0,1,2,3}
            index = bytecode-0x3b;
            gen_int_store(emitter,stack,prepass,method_info,
                regalloc,
                frame,index);
            break;
        case 0x43: case 0x44: case 0x45: case 0x46: // fstore_{0,1,2,3}
            index = bytecode-0x43;
            // emit a GC-tag clear for the slot when IS_INVALID_RV_BITMAP holds
            if(IS_INVALID_RV_BITMAP(method_info->ref_var_bitmap))
                GC_Tags_emit_clear(frame,emitter,prepass,index);
            src = stack.pop();
            stack.no_laziness(frame,index,n_reg);
            // floats are always stored to the frame slot
            gen_store32(emitter,stack,&M_Var_Opnd(frame,index),src);
            break;
        //
        // Store a long into the local pair (index, index+1), index taken from
        // the opcode.  High word -> slot index, low word -> slot index+1.
        // NOTE: reg_lo/reg_hi are not declared in this case; within this part
        // of the switch the visible declaration is the unscoped one in the
        // lstore (0x37) case.
        //
        case 0x3f: case 0x40: case 0x41: case 0x42: // lstore_{0,1,2,3}
            
            index = (bytecode-0x3f)&0x03;
            // emit GC-tag clears for both slots when IS_INVALID_RV_BITMAP holds
            if(IS_INVALID_RV_BITMAP(method_info->ref_var_bitmap)) {
                GC_Tags_emit_clear(frame,emitter,prepass,index);
                GC_Tags_emit_clear(frame,emitter,prepass,index+1);
            }
            if ((reg_lo = regalloc->reg_var_allocated_to(index+1, NULL/*XXX-hack*/)) != n_reg &&
                (reg_hi = regalloc->reg_var_allocated_to(index, NULL/*XXX-hack*/)) != n_reg)
            {
                // both halves are register-allocated: store register to register
                stack.pop64(src_lo, src_hi);
                stack.no_laziness(frame,index,reg_lo,reg_hi);
                gen_reg_store32(emitter,stack,reg_hi,src_hi);
                gen_reg_store32(emitter,stack,reg_lo,src_lo);
                break;
            }
            // otherwise store both halves to the frame slots
            stack.pop64(src_lo,src_hi);
            stack.no_laziness(frame,index,n_reg,n_reg);
            gen_store32(emitter,stack,&M_Var_Opnd(frame,index),src_hi);
            gen_store32(emitter,stack,&M_Var_Opnd(frame,index+1),src_lo);
            // tell the register manager which local each source operand now
            // corresponds to
            stack.reg_manager.reg_value(src_hi,index); // for LOAD_STORE
            stack.reg_manager.reg_value(src_lo,index+1); // for LOAD_STORE
            break;
            // JMS: lstore and dstore were originally the same.
        //
        // Store a double into the local pair (index, index+1), index taken from
        // the opcode's distance from dstore_0 (0x47).  Always goes through
        // store_double; there is no register path.
        //
        case 0x47: case 0x48: case 0x49: case 0x4a:	// dstore_{0,1,2,3}
            index = (bytecode-0x47)&0x03;
            // emit GC-tag clears for both slots when IS_INVALID_RV_BITMAP holds
            if(IS_INVALID_RV_BITMAP(method_info->ref_var_bitmap)) {
                GC_Tags_emit_clear(frame,emitter,prepass,index);
                GC_Tags_emit_clear(frame,emitter,prepass,index+1);
            }
            stack.pop64(src_lo,src_hi);
            stack.no_laziness(frame,index,n_reg,n_reg);
            store_double(emitter,frame,stack,index,src_lo,src_hi);
            break;
            
        //
        // All eight array-store opcodes share one emitter.  The element type is
        // passed as a character taken from "IJFDLBCS" indexed by the opcode's
        // distance from iastore (0x4f): iastore->'I', lastore->'J',
        // fastore->'F', dastore->'D', aastore->'L', bastore->'B',
        // castore->'C', sastore->'S'.
        //
        case 0x4f:	// iastore
        case 0x50:	// lastore
        case 0x51:	// fastore
        case 0x52:	// dastore
        case 0x53:	// aastore
        case 0x54:	// bastore
        case 0x55:	// castore
        case 0x56:	// sastore
            // record the code offset at which this access starts in the
            // current cs_info entry before any code for it is emitted
            method_info->cs_info[method_info->cnt].call_IP = (unsigned)emitter.get_offset();
            emit_array_store(mem_manager,emitter,stack,operand_pool,"IJFDLBCS"[bytecode - 0x4f],
                code_patch_list,method_handle,
                regalloc,
                bc_index,method_info
#ifndef NO_BOUNDS_CHECKING
                ,bounds
#endif // NO_BOUNDS_CHECKING
                );
#ifdef _CSE
            // with CSE enabled, notify the CSE tracker of the store; the second
            // argument is the opcode's distance from iastore (i.e. the element
            // kind in the same order as the case labels above)
            stack.cse->kill_reg_array_cse(bc_index,bytecode - 0x4f);
#endif // _CSE
            break;
            //
            // stack operations
            //
        //
        // pop / pop2: discard the top one or two operand words and release
        // whatever the operands hold in the register manager.
        //
        case 0x57:	// pop
            src = stack.pop();
            src->free_opnd(&stack.reg_manager);

            // outside fp-strict mode an Fp operand also occupies an x87 stack
            // entry: pop it with fstp st(0) and decrement the tracked fp count
            if (src->kind == Operand::Fp && !stack.fp_strict_mode) {
                emitter.emit_fstp(0);
                stack.fp_dec_cnt();
            }
            break;
        case 0x58:	// pop2
            stack.pop64(src_lo,src_hi);
            src_lo->free_opnd(&stack.reg_manager);
            src_hi->free_opnd(&stack.reg_manager);

            // only src_lo's kind is examined and a single fstp is emitted
            // NOTE(review): this matches one two-word fp value; if pop2 can
            // reach here with two separate one-word Fp operands, only one x87
            // entry would be popped - confirm
            if (src_lo->kind == Operand::Fp && !stack.fp_strict_mode) {
                emitter.emit_fstp(0);
                stack.fp_dec_cnt();
            }
            break;
        //
        // dup family: each opcode is delegated to its own gen_dup* helper.
        // Only gen_dup receives the bounds argument (when bounds checking is
        // compiled in), and only gen_dup and gen_dup_x1 take mem_manager.
        //
        case 0x59: // dup
            // push the stack top
#ifndef NO_BOUNDS_CHECKING
            gen_dup(mem_manager,emitter,stack,operand_pool,bounds);
#else // NO_BOUNDS_CHECKING
            gen_dup(mem_manager,emitter,stack,operand_pool);
#endif // NO_BOUNDS_CHECKING
            break;
        case 0x5a:	// dup_x1
            gen_dup_x1(mem_manager,emitter,stack,operand_pool);
            break;
        case 0x5b: 	// dup_x2
            gen_dup_x2(emitter,stack,operand_pool);
            break;
        case 0x5c:	// dup2
            gen_dup2(emitter,stack,operand_pool);
            break;
        case 0x5d:	// dup2_x1
            gen_dup2_x1(emitter,stack,operand_pool);
            break;
        case 0x5e:	// dup2_x2
            gen_dup2_x2(emitter,stack,operand_pool);
            break;
        case 0x5f:  // swap
            //
            // Exchange the two topmost one-word operands.  An operand of kind
            // Stk is first copied into a freshly acquired register, and that
            // register is pushed back in its place.
            //
            {
                Operand *word1 = stack.pop();   // former top of stack
                Operand *word2 = stack.pop();   // former second entry
                if (word1->kind == Operand::Stk) {
                    Reg_Operand *reg = stack.reg_manager.get_reg();
                    word1->emit_mov_to_reg(emitter,&reg->opnd);
                    word1 = reg;
                }
                if (word2->kind == Operand::Stk) {
                    Reg_Operand *reg = stack.reg_manager.get_reg();
                    word2->emit_mov_to_reg(emitter,&reg->opnd);
                    word2 = reg;
                }
                // pushing the former top first leaves the former second on top
                stack.push(word1);
                stack.push(word2);
            }
            break;
            //
            // arithmetic operations
            //
            //
            // add
            //
        //
        // Binary arithmetic.  Each opcode is delegated to a helper:
        //   int    : emit_alu32 / gen_imul / gen_idiv
        //   long   : emit_alu64 for add/sub (a low-word opcode plus a
        //            carry/borrow opcode for the high word); mul/div/rem call
        //            a runtime helper through gen_long_help_func
        //   float  : gen_fp with its bool argument false
        //   double : gen_fp with its bool argument true
        // The long runtime-helper cases first record the current code offset
        // in cs_info[cnt].call_IP.
        // NOTE(review): the trailing 1/0 passed to emit_alu32/emit_alu64 is 1
        // for add and 0 for sub - presumably a "commutative" flag; confirm
        // against the helper's declaration.
        //
        case 0x60:	// iadd
            emit_alu32(mem_manager,emitter,stack,add_opc,1);
            break;
        case 0x61:	// ladd
            emit_alu64(mem_manager,emitter,stack,operand_pool,add_opc,adc_opc,1);
            break;
        case 0x62:	// fadd
            gen_fp(emitter,fadd_opc,false,stack,mem_manager,frame,operand_pool);
            break;
        case 0x63:	// dadd
            gen_fp(emitter,fadd_opc,true,stack,mem_manager,frame,operand_pool);
            break;
            //
            // sub
            //
        case 0x64:	// isub
            emit_alu32(mem_manager,emitter,stack,sub_opc,0);
            break;
        case 0x65:	// lsub
            emit_alu64(mem_manager,emitter,stack,operand_pool,sub_opc,sbb_opc,0);
            break;
        // the fp subtract cases use the reversed-operand opcode (fsubr_opc)
        case 0x66:	// fsub
            gen_fp(emitter,fsubr_opc,false,stack,mem_manager,frame,operand_pool);
            break;
        case 0x67:	// dsub
            gen_fp(emitter,fsubr_opc,true,stack,mem_manager,frame,operand_pool);
            break;
            //
            // mul
            //
        case 0x68:	// imul
            gen_imul(mem_manager,emitter,stack);
            break;
        case 0x69:	// lmul
            method_info->cs_info[method_info->cnt].call_IP = (unsigned)emitter.get_offset();
            gen_long_help_func(mem_manager,emitter,stack,code_patch_list,
                frame,ORP_RT_LMUL,method_info);
            break;
        case 0x6a:	// fmul
            gen_fp(emitter,fmul_opc,false,stack,mem_manager,frame,operand_pool);
            break;
        case 0x6b:	// dmul
            gen_fp(emitter,fmul_opc,true,stack,mem_manager,frame,operand_pool);
            break;
            //
            //  div
            //
        case 0x6c:	// idiv
            gen_idiv(mem_manager,emitter,stack,operand_pool,1); // get quotient
            break;
        case 0x6d:	// ldiv
            method_info->cs_info[method_info->cnt].call_IP = (unsigned)emitter.get_offset();
            gen_long_help_func(mem_manager,emitter,stack,code_patch_list,
                frame,ORP_RT_LDIV,method_info);
            break;
        // the fp divide cases use the reversed-operand opcode (fdivr_opc)
        case 0x6e:	// fdiv
            gen_fp(emitter,fdivr_opc,false,stack,mem_manager,frame,operand_pool);
            break;
        case 0x6f:	// ddiv
            gen_fp(emitter,fdivr_opc,true,stack,mem_manager,frame,operand_pool);
            break;
            //
            // rem
            //
        case 0x70:	// irem
            gen_idiv(mem_manager,emitter,stack,operand_pool,0); // get remainder
            break;
        case 0x71:	// lrem
            method_info->cs_info[method_info->cnt].call_IP = (unsigned)emitter.get_offset();
            gen_long_help_func(mem_manager,emitter,stack,code_patch_list,
                frame,ORP_RT_LREM,method_info);
            break;
        case 0x72:	// frem
            gen_frem(mem_manager,emitter,stack,operand_pool);
            break;
        case 0x73:	// drem
            gen_drem(mem_manager,emitter,stack,operand_pool);
            break;
            //
            // neg
            //
        case 0x74:	// ineg
            {
                //
                // Negate the int on top of the stack.  The negation is done in
                // a register: in place when the operand already is a local
                // register, otherwise in a newly acquired one.
                //
                src = stack.pop();
                Reg_Operand *result;
                if (src->is_reg() && src->hold_local_reg(stack.reg_manager.local_regs())) {
                    // src is a local reg: negate it where it is
                    result = (Reg_Operand*)src;
                } else {
                    // release src, then load its value into a fresh register
                    src->free_opnd(&stack.reg_manager);
                    result = stack.reg_manager.get_reg();
                    src->emit_mov_to_reg(emitter,&result->opnd);
                }
                emitter.emit_neg(&result->opnd);
                stack.push(result);
                break;
            }
        case 0x75:	// lneg
            //
            // Two's-complement negate of a 64-bit value held in two registers:
            //
            //	neg	lo        ---- low 32
            //	adc hi, 0     ---- high 32 (adds the carry produced by the neg)
            //	neg hi
            //
            stack.pop64(src_lo,src_hi);
            //
            // mov low 32 bits into a scratch reg
            //
            // if src_lo is not a local reg
            // (note: the register is acquired BEFORE src_lo is freed here,
            //  whereas the src_hi path below frees first and acquires after)
            if (!src_lo->is_reg() || 
                !src_lo->hold_local_reg(stack.reg_manager.local_regs())) {
                Reg_Operand *reg = stack.reg_manager.get_reg();
                src_lo->free_opnd(&stack.reg_manager);
                src_lo->emit_mov_to_reg(emitter,&reg->opnd);
                src_lo = reg;
            }
            //
            // mov high 32 bits into a scratch reg
            //
            // if src_hi is not a local reg
            if (!src_hi->is_reg() ||
                !src_hi->hold_local_reg(stack.reg_manager.local_regs())) {
                src_hi->free_opnd(&stack.reg_manager);
                Reg_Operand *reg = stack.reg_manager.get_reg();
                src_hi->emit_mov_to_reg(emitter,&reg->opnd);
                src_hi = reg;
            }
            // src_lo and src_hi must be registers
            emitter.emit_neg(&((Reg_Operand*)src_lo)->opnd);
            emitter.emit_alu(adc_opc,&((Reg_Operand*)src_hi)->opnd, &Imm_Opnd(0));
            emitter.emit_neg(&((Reg_Operand*)src_hi)->opnd);
            stack.push64(src_lo,src_hi);
            break;
        case 0x76:	// fneg
            {
                //
                // fld	stack_top
                // fchs
                // fst	stack_top
                //
                // NOTE: this declaration is local to the braces and shadows any
                // src variable of the enclosing scope for the rest of the case.
                Operand *src = stack.pop();
                
                if (stack.fp_strict_mode)
                {
                    // strict mode: the operand is required to be in memory;
                    // load it, change the sign, and store the result into the
                    // operand-stack slot at the current depth, which becomes
                    // the new top of stack
                    assert(src->is_mem());
                    src->free_opnd(&stack.reg_manager);
                    Mem_Operand *m_src = (Mem_Operand*)src;
                    emitter.emit_fld(m_src->mem_opnd(),0);
                    emitter.emit_fchs();
                    Stack_Operand *dst = operand_pool.nth_stack(stack.depth());
                    stack.push(dst);
                    emitter.emit_fst(dst->mem_opnd(),0,1);
                }
                else
                {
                    // non-strict mode: operate on the value via the fp-stack
                    // helpers (false selects the single-precision variant, as
                    // dneg passes true for the same helpers)
                    load_onto_fp_stack(stack, src, false);

                    emitter.emit_fchs();
                    result_on_fp_stack(mem_manager, stack, false);
                }
                break;
            }
        case 0x77:	// dneg
            {
                //
                // fld	stack_top
                // fchs
                // fst	stack_top
                //
                // src receives the first (low) operand of the pair; only it is
                // used to address the value, src_hi is not touched afterwards
                stack.pop64(src,src_hi);
                
                if (stack.fp_strict_mode)
                {
                    // strict mode: the operand is required to be in memory;
                    // load it as a double, change the sign, and store it to the
                    // operand-stack slot at depth+1, then push the slot pair
                    // (depth+1, depth) as the 64-bit result
                    assert(src->is_mem());
                    src->free_opnd(&stack.reg_manager);
                    Mem_Operand *m_src = (Mem_Operand*)src;
                    emitter.emit_fld(m_src->mem_opnd(),1);
                    emitter.emit_fchs();
                    Stack_Operand *dst = operand_pool.nth_stack(stack.depth()+1);
                    emitter.emit_fst(dst->mem_opnd(),1,1);
                    stack.push64(dst,operand_pool.nth_stack(stack.depth()));
                }
                else
                {
                    // non-strict mode: operate on the value via the fp-stack
                    // helpers (true selects the double-precision variant)
                    load_onto_fp_stack(stack, src, true);
                   
                    emitter.emit_fchs();
                    result_on_fp_stack(mem_manager, stack, true);
                }                
                break;
            }
            //
            // logical operations
            //
            //
            // shl
            //
        //
        // Shifts and bitwise logic.
        //   int shifts  : gen_shift with shl (ishl), sar (ishr), shr (iushr)
        //   long shifts : gen_long_shift with the matching ORP_RT_L* runtime id
        //   and/or/xor  : emit_alu32 for int; emit_alu64 for long, where the
        //                 same opcode is applied to both halves
        //
        case 0x78: // ishl
            gen_shift(mem_manager,emitter,stack,shl_opc);
            break;
        case 0x79: // lshl
            gen_long_shift(mem_manager,emitter,stack,code_patch_list,
                operand_pool,ORP_RT_LSHL);
            break;
        case 0x7a: // ishr
            gen_shift(mem_manager,emitter,stack,sar_opc);
            break;
        case 0x7b: // lshr
            gen_long_shift(mem_manager,emitter,stack,code_patch_list,
                operand_pool,ORP_RT_LSHR);
            break;
        case 0x7c: // iushr
            gen_shift(mem_manager,emitter,stack,shr_opc);
            break;
        case 0x7d:	// lushr
            gen_long_shift(mem_manager,emitter,stack,code_patch_list,
                operand_pool,ORP_RT_LUSHR);
            break;
            //
            // and
            //
        case 0x7e:	// iand
            emit_alu32(mem_manager,emitter,stack,and_opc,1);
            break;
        case 0x7f:	// land
            emit_alu64(mem_manager,emitter,stack,operand_pool,and_opc,and_opc,1);
            break;
            //
            // or
            //
        case 0x80:	// ior
            emit_alu32(mem_manager,emitter,stack,or_opc,1);
            break;
        case 0x81:	// lor
            emit_alu64(mem_manager,emitter,stack,operand_pool,or_opc,or_opc,1);
            break;
            //
            // xor
            //
        case 0x82:	// ixor
            emit_alu32(mem_manager,emitter,stack,xor_opc,1);
            break;
        case 0x83:	// lxor
            emit_alu64(mem_manager,emitter,stack,operand_pool,xor_opc,xor_opc,1);
            break;
            //
            // iinc local, constant
            //
        case 0x84:  // iinc
            {
                //
                //	add var_offset[ebp],constant
                //
                // operands: one byte of local index followed by one byte of
                // increment
                index = *bc++;
                // the increment is read through a char* so it can be negative
                // (assumes plain char is signed on this target - TODO confirm)
                int inc = *(char*)bc++;
                gen_iinc(emitter,stack,method_info,
                    regalloc,
                    frame,index,inc);
            }
            break;
            
        //
        // Type conversions.  Where a helper serves two opcodes, its trailing
        // argument tells them apart:
        //   gen_int2fp / gen_long2fp    : 0 for the ->float form, 1 for ->double
        //   gen_float2int/gen_double2int: the ORP_RT_* runtime id of the
        //                                 conversion
        //   gen_int2bcs                 : (1,0) i2b, (0,1) i2c, (1,1) i2s
        //
        case 0x85:	// i2l
            gen_i2l(emitter,stack);
            break;
        case 0x86:	// i2f
            gen_int2fp(mem_manager,emitter,stack,operand_pool,0);
            break;
        case 0x87:	// i2d
            gen_int2fp(mem_manager,emitter,stack,operand_pool,1);
            break;
        case 0x88:	// l2i
            //
            // keep low 32 bit
            //
            // no code is emitted: the low operand is pushed back as the int
            // result and the high operand is released if it is a register
            stack.pop64(src_lo,src_hi);
            stack.push(src_lo);
            if (src_hi->is_reg())
                src_hi->free_opnd(&stack.reg_manager);
            break;
        case 0x89:	// l2f
            gen_long2fp(mem_manager,emitter,stack,operand_pool,0);
            break;
        case 0x8a:	// l2d
            gen_long2fp(mem_manager,emitter,stack,operand_pool,1);
            break;
        case 0x8b:	// f2i
            gen_float2int(mem_manager,emitter,stack,code_patch_list,ORP_RT_F2I);
            break;
        case 0x8c:	// f2l
            gen_float2int(mem_manager,emitter,stack,code_patch_list,ORP_RT_F2L);
            break;
        case 0x8d:	// f2d
            gen_f2d(mem_manager,emitter,stack,operand_pool);
            break;
        case 0x8e:	// d2i
            gen_double2int(mem_manager,emitter,stack,code_patch_list,ORP_RT_D2I);
            break;
        case 0x8f:	// d2l
            gen_double2int(mem_manager,emitter,stack,code_patch_list,ORP_RT_D2L);
            break;
        case 0x90:	// d2f
            gen_d2f(mem_manager,emitter,stack,operand_pool);
            break;
        case 0x91:	// i2b
            //
            //	movsx	eax,[esp]
            //
            gen_int2bcs(emitter,stack,1,0);
            break;
        case 0x92:	// i2c
            //
            //	movzx	eax,[esp]
            //
            gen_int2bcs(emitter,stack,0,1);
            break;
        case 0x93:	// i2s
            gen_int2bcs(emitter,stack,1,1);
            break;
        case 0x94:	// lcmp
        case 0x95:  // fcmpl
        case 0x96:	// fcmpg
        case 0x97:	// dcmpl
        case 0x98:  // dcmpg
            {
                //
                // Peek at the opcode that follows.  When it is one of
                // if{eq,ne,lt,ge,gt,le} nothing is emitted here; the if<cond>
                // case handles the comparison together with the branch.
                // Otherwise emit code that produces the compare result
                // (1, 0, or -1).
                //
                const unsigned char following_bc = *bc;
                const bool followed_by_if =
                    (following_bc >= 0x99 /* ifeq */ && following_bc <= 0x9e /* ifle */);
                if (!followed_by_if) {
                    emit_compare(emitter,stack,bytecode,operand_pool);
                }
            }
            break;
        //
        // if<cond>: conditional branch on a value compared against zero, or the
        // branch half of a preceding lcmp/fcmp/dcmp (see *prev_bc below).
        //
        case 0x99: case 0x9a:	// if{eq,ne,lt,ge,gt,le} int comparisons against zero
        case 0x9b: case 0x9c:
        case 0x9d: case 0x9e:
            {
                // 16-bit big-endian branch offset; the high byte is read
                // through a char* to get a signed value (assumes plain char is
                // signed on this target - TODO confirm)
                int branch_offset = (*(char*)bc << 8) + bc[1];
                // offsets are relative to this instruction's opcode (curr_bc);
                // convert to an index from the start of the bytecode array
                unsigned target_bc_index = (curr_bc + branch_offset) - first_bc;
                // this instruction is 3 bytes long
                unsigned fall_thru_index = bc_index + 3;
                // dispatch on the previous opcode: a long or fp compare that
                // was deferred by its own case is emitted together with the
                // branch here; anything else is a plain compare with zero
                if (*prev_bc == 0x94) // lcmp
                    emit_lcmp_br(mem_manager,emitter,stack,code_patch_list,map,
                    bytecode,branch_offset<0,target_bc_index,fall_thru_index);
                else if (*prev_bc >= 0x95 && *prev_bc <= 0x98) // fcmpl, fcmpg, dcmpl, dcmpg
                    emit_fdcmp_br(mem_manager,emitter,stack,code_patch_list,map,
                    bytecode,*prev_bc,branch_offset<0,target_bc_index,fall_thru_index,operand_pool);
                else
                    emit_cmp0_br(mem_manager,emitter,stack,code_patch_list,map,
                    bytecode,branch_offset<0,target_bc_index);
                // step over the two offset bytes
                bc += 2;
                break;
            }
        //
        // Two-operand conditional branches.  Both cases decode a signed 16-bit
        // big-endian offset, emit the compare via emit_cmp using a condition
        // taken from branch_cc[], emit the branch via emit_br, and then step
        // over the two offset bytes.  if_acmp{eq,ne} index branch_cc[] with
        // 0 and 1, i.e. the same entries if_icmpeq and if_icmpne use.
        //
        case 0x9f: case 0xa0:	// if_icmp{eq,ne,lt,ge,gt,le}
        case 0xa1: case 0xa2:	// integer conditional branch
        case 0xa3: case 0xa4:
            {
                // high byte read through char* for sign (assumes signed char)
                int branch_offset = (*(char*)bc << 8) + bc[1];
                unsigned target_bc_index = (curr_bc + branch_offset) - first_bc;
                //
                // emit the comparison code and return the condition code to be
                // used by the later branch instruction.
                //
                X86_CC cc = emit_cmp(emitter,stack,branch_cc[bytecode-0x9f]);
                emit_br(mem_manager,emitter,code_patch_list,map,cc,
                        target_bc_index,branch_offset<0,true);
                bc += 2;
                break;
            }
        case 0xa5: case 0xa6:	// if_acmp{eq,ne}
            {
                // high byte read through char* for sign (assumes signed char)
                int branch_offset = (*(char*)bc << 8) + bc[1];
                unsigned target_bc_index = (curr_bc + branch_offset) - first_bc;
                //
                // emit the comparison code and return the condition code to be
                // used by the later branch instruction.
                //
                X86_CC cc = emit_cmp(emitter,stack,branch_cc[bytecode-0xa5]);
                emit_br(mem_manager,emitter,code_patch_list,map,cc,
                        target_bc_index,branch_offset<0,true);
                bc += 2;
                break;
            }
        //
        // goto: unconditional branch with a signed 16-bit offset.
        //
        case 0xa7:	// goto
            {
                stack.home_all(); // make sure all operands are spilled at the end of bb
                // high byte read through char* for sign (assumes signed char)
                int branch_offset = (*(char*)bc << 8) + bc[1];
                unsigned target_bc_index = (curr_bc + branch_offset) - first_bc;
                // an offset of zero (branch to self) is treated as backward
                if (branch_offset <= 0) {
                    //
                    // backward branches do not require patching
                    //
                    // displacement is the target's recorded code offset minus
                    // the emitter's current offset
                    int disp = map[target_bc_index].offset - emitter.get_offset();
                    emitter.emit_jump(disp);
                } else {
                    //
                    // emit a branch with 32-bit offset that is later patched
                    //
                    emitter.emit_jump32(&Imm_Opnd(0));
                    //
                    // create a patch entry
                    //
                    // the 32-bit field to patch is the last 4 bytes just emitted
                    unsigned patch_offset = emitter.get_offset() - 4;
                    // the new patch is linked at the head of both the global
                    // patch list and the target bytecode's patch list
                    Branch_Patch *p = new(mem_manager) 
                        Branch_Patch(code_patch_list,patch_offset,map[target_bc_index].patch);
                    code_patch_list = map[target_bc_index].patch = p;
                }
                // step over the two offset bytes
                bc += 2;
                break;
            }
        //
        // jsr / jsr_w: store the address of the instruction that follows the
        // jsr into the operand-stack slot at the current depth, then jump to
        // the subroutine.  The stored address is emitted as a zero immediate
        // and patched once the code offset of the following bytecode is known.
        //
        case 0xa8:	// jsr
        case 0xc9:  // jsr_w
            {
                stack.call_home(0);
                int branch_offset;
                // bytecode index of the instruction following this one; it
                // depends on the instruction length (jsr: 3 bytes, jsr_w: 5)
                unsigned return_bc_index;
                if (bytecode == 0xa8) { // jsr: signed 16-bit offset
                    branch_offset = (*(char*)bc << 8) + bc[1];
                    bc += 2;
                    return_bc_index = bc_index + 3;
                } else { // jsr_w: 32-bit offset
                    branch_offset = ((bc[0]<<24) + (bc[1]<<16) + (bc[2]<<8) + bc[3]);
                    bc += 4;
                    return_bc_index = bc_index + 5;
                }

                unsigned target_bc_index = (curr_bc + branch_offset) - first_bc;
                //
                // mov the address of the instruction followed by jsr
                //
                emitter.emit_mov(&operand_pool.nth_stack(stack.depth())->opnd,&Imm_Opnd(0),opnd_32);
                // the immediate is the last 4 bytes of the mov just emitted;
                // hang a Mov_Patch for it on the return bytecode's patch list
                unsigned patch_offset = emitter.get_offset() - 4;
                Branch_Patch *p = new(mem_manager) 
                    Mov_Patch(code_patch_list,patch_offset,map[return_bc_index].patch);
                code_patch_list = map[return_bc_index].patch = p;
                if (branch_offset < 0) {
                    //
                    // backward branches do not require patching
                    //
                    int disp = map[target_bc_index].offset - emitter.get_offset();
                    emitter.emit_jump(disp);
                } else {
                    //
                    // emit a branch with 32-bit offset that is later patched
                    //
                    emitter.emit_jump32(&Imm_Opnd(0));
                    //
                    // create a patch entry
                    //
                    patch_offset = emitter.get_offset() - 4;
                    p = new(mem_manager) 
                        Branch_Patch(code_patch_list,patch_offset,map[target_bc_index].patch);
                    code_patch_list = map[target_bc_index].patch = p;
                }  
            }
            break;
        case 0xa9: // ret
            {
                // return from a subroutine: the one-byte operand names the
                // local variable holding the return address; emit an indirect
                // jump through that frame slot
                index = *bc++;
                emitter.emit_jump(&M_Var_Opnd(frame,index));
            }
            break;
        case 0xaa: // tableswitch
            {
                // get the index from the top of stack
                src = stack.pop();
                stack.home_all(); // make sure all operands are spilled at the end of bb
                Reg_Operand *src_reg;
                // src is not a local reg
                if (!src->is_reg() || 
                    !src->hold_local_reg(stack.reg_manager.local_regs())) {
                    src->free_opnd(&stack.reg_manager);
                    src_reg = stack.reg_manager.get_reg();
                    src->emit_mov_to_reg(emitter,&src_reg->opnd);
                } else {
                    src_reg = (Reg_Operand*)src;
                }
                // skip over padding bytes to align on 4 byte boundary
                bc = (curr_bc+1) + ((4 - (bc_index+1)) & 0x03);
                // offset default label
                int default_offset = ((bc[0]<<24) + (bc[1]<<16) + (bc[2]<<8) + bc[3]);
                // low
                int low = ((bc[4]<<24) + (bc[5]<<16) + (bc[6]<<8) + bc[7]);
                // high
                int high = ((bc[8]<<24) + (bc[9]<<16) + (bc[10]<<8) + bc[11]);
                bc += 12;
                int n_entries = high - low + 1;
                //
                // cmp src_reg, low
                // jlt default_offset
                // cmp src_reg, high
                // jgt default_offset
                //
                emitter.emit_alu(cmp_opc,&src_reg->opnd,&Imm_Opnd(low));
                unsigned default_bc_index = (curr_bc + default_offset) - first_bc;
                if (default_offset < 0) {
                    //
                    // backward branches do not require patching
                    //
                    int disp = map[default_bc_index].offset - emitter.get_offset();
                    emitter.emit_branch(cc_lt,disp,1);
					//
					// For statistics, inserting instruments
					//
					if(inner_statistics){
						assert(emitter.prof_rec) ;
						inner_bb_instrumenting_code(emitter,
							(unsigned*)&((PROF_COUNTER*)&((unsigned short*)&emitter.prof_rec->back_edge[emitter.prof_rec->n_back_edge])[emitter.prof_rec->n_back_edge])[emitter.inner_bb_cnt_offset++]);
					}
                    emitter.emit_alu(cmp_opc,&src_reg->opnd,&Imm_Opnd(high));
                    disp = map[default_bc_index].offset - emitter.get_offset();
                    emitter.emit_branch(cc_gt,disp,1);
                } else {
                    //
                    // emit a branch with 32-bit offset that is later patched
                    //
                    emitter.emit_branch32(cc_lt,&Imm_Opnd(0),1);
                    //
                    // create a patch entry
                    //
                    unsigned patch_offset = emitter.get_offset() - 4;
                    Branch_Patch *p = new(mem_manager)
                        Branch_Patch(code_patch_list,patch_offset,map[default_bc_index].patch);
                    code_patch_list = map[default_bc_index].patch = p;
					//
					// For statistics, inserting instruments
					//
					if(inner_statistics){
						assert(emitter.prof_rec) ;
						inner_bb_instrumenting_code(emitter,
							(unsigned*)&((PROF_COUNTER*)&((unsigned short*)&emitter.prof_rec->back_edge[emitter.prof_rec->n_back_edge])[emitter.prof_rec->n_back_edge])[emitter.inner_bb_cnt_offset++]);
					}
                    //
                    // cmp src_reg,high
                    // jgt
                    //
                    emitter.emit_alu(cmp_opc,&src_reg->opnd,&Imm_Opnd(high));
                    emitter.emit_branch32(cc_gt,&Imm_Opnd(0),1);
                    patch_offset = emitter.get_offset() - 4;
                    p = new(mem_manager)
                        Branch_Patch(code_patch_list,patch_offset,map[default_bc_index].patch);
                    code_patch_list = map[default_bc_index].patch = p;
                }
				//
				// For statistics, inserting instruments
				//
				if(inner_statistics){
					assert(emitter.prof_rec) ;
					inner_bb_instrumenting_code(emitter,
						(unsigned*)&((PROF_COUNTER*)&((unsigned short*)&emitter.prof_rec->back_edge[emitter.prof_rec->n_back_edge])[emitter.prof_rec->n_back_edge])[emitter.inner_bb_cnt_offset++]);
				}
                //
                // sub  src_reg,low		; only if low != 0
                // shl  src_reg, 2
                // jmp  [src_reg + table_base]
                //
                if (low != 0) {
                    emitter.emit_alu(sub_opc,&src_reg->opnd,&Imm_Opnd(low));
                }
                emitter.emit_shift(shl_opc,&src_reg->opnd,&Imm_Opnd(2));
                Data_Label *data_label = ro_data_emitter.make_label();
                emitter.emit_jump(&M_Base_Patch_Opnd(src_reg->opnd.reg_no(),data_label));
                //
                // fill in the jump table
                //
                for (int i = 0; i < n_entries; i++) {
                    int offset = ((bc[0]<<24) + (bc[1]<<16) + (bc[2]<<8) + bc[3]);
                    bc += 4;
                    unsigned target_bc_index = (curr_bc + offset) - first_bc;
                    if (offset < 0) {
                        //
                        // backward: target byte code already visited --> use target's offset
                        //
                        table_entry_patch_list = new(mem_manager)
                            Table_Entry_Patch(table_entry_patch_list,ro_data_emitter.get_offset(),
                            map[target_bc_index].offset);
                    } else {
                        //
                        // forward: put on target byte code's patch list
                        //
                        map[target_bc_index].patch = table_entry_patch_list = new(mem_manager)
                            Table_Entry_Patch(table_entry_patch_list,ro_data_emitter.get_offset(),
                            map[target_bc_index].patch);
                    }
                    //
                    // allocate an entry in the read-only data block
                    //
                    ro_data_emitter.emit_int(0);
                }
                break;
        }
        case 0xab:	// lookupswitch
            {
                // get the key from top of stack
                src = stack.pop();
                stack.home_all(); // make sure all operands are spilled at the end of bb
                Reg_Operand *src_reg;
                if (!src->is_reg()) {
                    src->free_opnd(&stack.reg_manager);
                    src_reg = stack.reg_manager.get_reg();
                    src->emit_mov_to_reg(emitter,&src_reg->opnd);
                } else {
                    src_reg = (Reg_Operand*)src;
                }
                //
                // skip over padding bytes to align on 4 byte boundary
                //
                bc = (curr_bc+1) + ((4 - (bc_index+1)) & 0x03);
                //
                // offset default label
                //
                int default_offset = ((bc[0]<<24) + (bc[1]<<16) + (bc[2]<<8) + bc[3]);
                //
                // number of match-offset pairs in lookup table
                //
                int npairs = ((bc[4]<<24) + (bc[5]<<16) + (bc[6]<<8) + bc[7]);
                bc += 8;
				int i = 0 ;//for Linux, need defined here
                for (i = 0; i < npairs; i++) {
					//
					// For statistics, inserting instruments
					//
					if(inner_statistics && i>0){ //before the next cmp
						assert(emitter.prof_rec) ;
						inner_bb_instrumenting_code(emitter,
							(unsigned*)&((PROF_COUNTER*)&((unsigned short*)&emitter.prof_rec->back_edge[emitter.prof_rec->n_back_edge])[emitter.prof_rec->n_back_edge])[emitter.inner_bb_cnt_offset++]);
					}

                    // get the match
                    int match_value = ((bc[0]<<24) + (bc[1]<<16) + (bc[2]<<8) + bc[3]);
                    // get the offset
                    int branch_offset = ((bc[4]<<24) + (bc[5]<<16) + (bc[6]<<8) + bc[7]);
                    unsigned target_bc_index = (curr_bc + branch_offset) - first_bc;
                    //
                    // cmp	reg, match
                    // beq	offset
                    //
                    emitter.emit_alu(cmp_opc,&src_reg->opnd,&Imm_Opnd(match_value));
                    if (branch_offset < 0) {
                        //
                        // backward branches do not require patching
                        //
                        unsigned target_offset = map[target_bc_index].offset;
                        int disp = target_offset - emitter.get_offset();
                        emitter.emit_branch(cc_eq,disp,0);
                    } else {
                        //
                        // emit a branch with 32-bit offset that is later patched
                        //
                        emitter.emit_branch32(cc_eq,&Imm_Opnd(0),1);
                        //
                        // create a patch entry
                        //
                        unsigned patch_offset = emitter.get_offset() - 4;
                        Branch_Patch *p = new(mem_manager) 
                            Branch_Patch(code_patch_list,patch_offset,map[target_bc_index].patch);
                        code_patch_list = map[target_bc_index].patch = p;
                    }
                    bc += 8;
                }
				//
				// For statistics, inserting instruments
				//
				if(inner_statistics && i>0){ //the last one!
					assert(emitter.prof_rec) ;
					inner_bb_instrumenting_code(emitter,
						(unsigned*)&((PROF_COUNTER*)&((unsigned short*)&emitter.prof_rec->back_edge[emitter.prof_rec->n_back_edge])[emitter.prof_rec->n_back_edge])[emitter.inner_bb_cnt_offset++]);
				}

                // jump to default label
                unsigned default_bc_index = (curr_bc + default_offset) - first_bc;
                if (default_offset < 0) {
                    //
                    // backward branches do not require patching
                    //
                    int disp = map[default_bc_index].offset - emitter.get_offset();
                    emitter.emit_jump(disp);
                } else {
                    //
                    // emit a branch with 32-bit offset that is later patched
                    //
                    emitter.emit_jump32(&Imm_Opnd(0));
                    //
                    // create a patch entry
                    //
                    unsigned patch_offset = emitter.get_offset() - 4;
                    Branch_Patch *p = new(mem_manager) 
                        Branch_Patch(code_patch_list,patch_offset,map[default_bc_index].patch);
                    code_patch_list = map[default_bc_index].patch = p;
                }
                break;
            }
        case 0xac:	// ireturn
        case 0xb0:	// areturn
            //
            //	put return value into eax and emit epilog
            //
            emit_epilog(emitter,frame,callee_saved_regs_mask,
                1,method_handle,method_flags,mem_manager,code_patch_list,
                method_info,stack,prepass,curr_bc);
            break;
        case 0xae:	// freturn
            {
                emit_epilog(emitter,frame,callee_saved_regs_mask,
                    0,method_handle,method_flags,mem_manager,code_patch_list,
                    method_info,stack,prepass,curr_bc);
                break;
            }
        case 0xad:	// lreturn
            {
                emit_epilog(emitter,frame,callee_saved_regs_mask,
                    2,method_handle,method_flags,mem_manager,code_patch_list,
                    method_info,stack,prepass,curr_bc);
            }
            break;
        case 0xaf:	// dreturn
            //
            // return 64-bit fp value
            //
            {
                emit_epilog(emitter,frame,callee_saved_regs_mask,
                    0,method_handle,method_flags,mem_manager,code_patch_list,
                    method_info,stack,prepass,curr_bc);
                break;
            }
        case 0xb1:	// return
            //
            // return without value
            //
            emit_epilog(emitter,frame,callee_saved_regs_mask,
                0,method_handle,method_flags,mem_manager,code_patch_list,
                method_info,stack,prepass,curr_bc);
            break;
        case 0xb2:	// getstatic
            index = (bc[0] << 8) + bc[1];
            bc += 2;
            if (gen_getstatic(mem_manager, emitter, stack, class_handle,
                compilation_handle, index, code_patch_list, method_info, 
                num_get_put_static, class_initializer, code_mi)) {
                VMresFailed = 1;
                break;
            }
            break;
        case 0xb3:	// putstatic
            index = (bc[0] << 8) + bc[1];
            bc += 2;
            if (gen_putstatic(mem_manager, emitter, stack, class_handle,
                compilation_handle, index, code_patch_list, method_info, 
                num_get_put_static, class_initializer, code_mi)) {
                VMresFailed = 1;
                break;
            }
            break;
        case 0xb4:	// getfield
            index = (bc[0] << 8) + bc[1];
            bc += 2;
            if (gen_getfield(mem_manager,emitter,code_patch_list,stack,class_handle,
                compilation_handle,index,method_info)) {
                VMresFailed = 1;
                break;
            }
            break;
        case 0xb5:	// putfield
			index = (bc[0] << 8) + bc[1];
            bc += 2;
            if (gen_putfield(mem_manager,emitter,code_patch_list,stack,class_handle,
                compilation_handle,(flags.insert_write_barriers == TRUE),index,method_info)) {
                VMresFailed = 1;
                break;
            }
            break;
        case 0xb6:	// invokevirtual
            index = (bc[0] << 8) + bc[1];
            bc += 2;
            method_info->cs_info[method_info->cnt].call_IP = (unsigned)emitter.get_offset();
            if (gen_invokevirtual(mem_manager,emitter,stack,operand_pool,
                frame,class_handle,compilation_handle,index,method_info)) {
                VMresFailed = 1;
                break;
            }
            break;
        case 0xb7:	// invokespecial
            index = (bc[0] << 8) + bc[1];
            bc += 2;
            method_info->cs_info[method_info->cnt].call_IP = (unsigned)emitter.get_offset();
            if (gen_invokespecial(mem_manager,emitter,stack,operand_pool,
                frame,class_handle,compilation_handle,index,method_info)) {
                VMresFailed = 1;
                break;
            }
            break;
        case 0xb8:	// invokestatic
            index = (bc[0] << 8) + bc[1];
            bc += 2;
            method_info->cs_info[method_info->cnt].call_IP = (unsigned)emitter.get_offset();
            if (gen_invokestatic(mem_manager,emitter,stack,operand_pool,
                frame,class_handle,compilation_handle,index,method_info)) {
                VMresFailed = 1;
                break;
            }
            break;
        case 0xb9: {// invokeinterface
            index = (bc[0] << 8) + bc[1];
            unsigned n_args = bc[2];
            bc += 4;
            // actual invoke call site
            method_info->cs_info[method_info->cnt].call_IP = (unsigned)emitter.get_offset();
            // helper call site
            method_info->cs_info[method_info->cnt+1].call_IP = (unsigned)emitter.get_offset();
            if (gen_invokeinterface(mem_manager,emitter,stack,operand_pool,
                frame,class_handle,compilation_handle,index,method_info,
                n_args,rw_data_emitter,code_patch_list)) {
                VMresFailed = 1;
                break;
            }
                   }	break;
        case 0xba:				// unused
            break;
        case 0xbb:	// new
            index = (bc[0] << 8) + bc[1];
            bc += 2;
            method_info->cs_info[method_info->cnt].call_IP = (unsigned)emitter.get_offset();
            if (gen_new(mem_manager,emitter,stack,code_patch_list,class_handle,
                compilation_handle,index,method_info)) {
                VMresFailed = 1;
                break;
            }
            break;
        case 0xbc:	// newarray
            index = *bc++;	// type code of array
            method_info->cs_info[method_info->cnt].call_IP = (unsigned)emitter.get_offset();
            gen_newarray(mem_manager,emitter,stack,code_patch_list,
#ifndef NO_BOUNDS_CHECKING
                class_handle,index,method_info,frame,bounds);
#else // NO_BOUNDS_CHECKING
            class_handle,index,method_info,frame);
#endif // NO_BOUNDS_CHECKING
            break;
        case 0xbd:{	// anewarray
            index = (bc[0] << 8) + bc[1];
            bc += 2;
            method_info->cs_info[method_info->cnt].call_IP = (unsigned)emitter.get_offset();
            gen_anewarray(mem_manager, emitter, stack, code_patch_list,
                class_handle, compilation_handle, index, method_info, frame
#ifndef NO_BOUNDS_CHECKING
                , bounds
#endif // NO_BOUNDS_CHECKING
                );
                  }	break;
        case 0xbe:	// arraylength
            {
                Reg_Operand *reg;
                Operand *array = stack.pop();
                if (!array->is_reg()) {
                    array->free_opnd(&stack.reg_manager);
                    reg = stack.reg_manager.get_reg();
                    array->emit_mov_to_reg(emitter,&reg->opnd);
                    array = reg;
                }
                array->free_opnd(&stack.reg_manager);
                reg = stack.reg_manager.get_reg();
//zying1
#ifndef OLD_OBJ_LAYOUT
				int length_offset = array_length_offset();
				M_Base_Opnd length(((Reg_Operand*)array)->opnd.reg_no(),length_offset);
#else
				M_Base_Opnd length(((Reg_Operand*)array)->opnd.reg_no(),4);
#endif
                emitter.emit_mov(&reg->opnd,&length);
                stack.push(reg);
            }
            break;
        case 0xbf:	// athrow
            //
            // call Throw(objectref)
            //
            method_info->cs_info[method_info->cnt].call_IP = (unsigned)emitter.get_offset();
            gen_athrow(mem_manager,emitter,stack,code_patch_list,method_info,frame);
            break;
        case 0xc0:	// checkcast
            index = (bc[0] << 8) + bc[1];
            bc += 2;
            method_info->cs_info[method_info->cnt].call_IP = (unsigned)emitter.get_offset();
            if (gen_checkcast(mem_manager,emitter,stack,code_patch_list,
                class_handle,compilation_handle,index,method_info,frame)) {
                VMresFailed = 1;
                break;
            }
            break;
        case 0xc1:	// instanceof
            index = (bc[0] << 8) + bc[1];
            bc += 2;
            method_info->cs_info[method_info->cnt].call_IP = (unsigned)emitter.get_offset();
            if (gen_instanceof(mem_manager,emitter,stack,code_patch_list,
                class_handle,compilation_handle,index,method_info,frame)) {
                VMresFailed = 1;
                break;
            }
            break;
        case 0xc2:	// monitorenter
            //
            // call MonEnter(objectref)
            //
            method_info->cs_info[method_info->cnt].call_IP = (unsigned)emitter.get_offset();
            gen_monitorenter(mem_manager,emitter,stack,code_patch_list,method_info,frame);
            break;
        case 0xc3:	// monitorexit
            //
            // call MonExit(objectref)
            //
            method_info->cs_info[method_info->cnt].call_IP = (unsigned)emitter.get_offset();
            gen_monitorexit(mem_manager,emitter,stack,code_patch_list,method_info,frame);
            break;
        case 0xc4:	// wide
            bytecode = *bc++;
            index = (bc[0] << 8) + bc[1];
            if (bytecode == 0x84) {
                // iinc local, constant
                bc += 2;
                imm.value = (*(char*)bc << 8) + bc[1];
                bc += 2;
                gen_iinc(emitter,stack,method_info,
                    regalloc,
                    frame,index,imm.value);
            } else {
                //
                // The wide case happens rarely.  No registers are assigned to
                // those variables.
                //
                bc += 2;
                switch (bytecode) {
                case 0x15:	// iload
                case 0x17:	// fload
                    stack.push(new (mem_manager) Mem_Var_Operand(frame,index,0));
                    break;
                case 0x19:	// aload
                    stack.push(new (mem_manager) Mem_Var_Operand(frame,index,1));
                    break;
                case 0x16:	// lload
                    stack.push64(new(mem_manager) Mem_Var_Operand(frame,index+1,0),
                                 new(mem_manager) Mem_Var_Operand(frame,index,0));
                    break;
                case 0x18:	// dload
                    load_double(emitter,frame,stack,mem_manager,index);
                    break;
                case 0x36:	// istore
                case 0x38:	// fstore
                    src = stack.pop();
                    gen_store32(emitter,stack,&M_Var_Opnd(frame,index),src);
                    break;
                case 0x3a:	// astore
                    src = stack.pop();
                    gen_store32(emitter,stack,&M_Var_Opnd(frame,index),src);
#ifndef NO_BOUNDS_CHECKING
                    bounds.reset(index);
#endif // NO_BOUNDS_CHECKING
                    break;
                case 0x37:	// lstore
                    stack.pop64(src_lo,src_hi);
                    gen_store32(emitter,stack,&M_Var_Opnd(frame,index),src_hi);
                    gen_store32(emitter,stack,&M_Var_Opnd(frame,index+1),src_lo);
                    break;
                case 0x39:	// dstore
                    stack.pop64(src_lo,src_hi);
                    store_double(emitter,frame,stack,index,src_lo,src_hi);
                    break;
                    
                case 0xa9: // ret
                    emitter.emit_jump(&M_Var_Opnd(frame,index));
                    break;
                } // switch
            }
            break;
        case 0xc5: {// multianewarray
            index = (bc[0] << 8) + bc[1];
            bc += 2;
            int dimensions = *bc++;
            method_info->cs_info[method_info->cnt].call_IP = (unsigned)emitter.get_offset();
            gen_multianewarray(mem_manager, emitter, stack, code_patch_list,
                dimensions, class_handle, compilation_handle, index,
                method_info, frame);
                   }	break;
            //
            // ifnull,ifnonnull
            //
        case 0xc6:	// ifnull
            {
                int branch_offset = (*(char*)bc << 8) + bc[1];
                unsigned target_bc_index = (curr_bc + branch_offset) - first_bc;
                emit_cmp_zero(emitter,stack,bytecode);
                if (branch_offset < 0) {
                    //
                    // backward branches do not require patching
                    //
                    unsigned target_offset = map[target_bc_index].offset;
                    int disp = target_offset - emitter.get_offset();
                    emitter.emit_branch(cc_eq,disp,0);
                } else {
                    //
                    // emit a branch with 32-bit offset that is later patched
                    //
                    emitter.emit_branch32(cc_eq,&Imm_Opnd(0),0);
                    //
                    // create a patch entry
                    //
                    unsigned patch_offset = emitter.get_offset() - 4;
                    Branch_Patch *p = new(mem_manager) 
                        Branch_Patch(code_patch_list,patch_offset,map[target_bc_index].patch);
                    code_patch_list = map[target_bc_index].patch = p;
                }
                bc += 2;
                break;
            }
        case 0xc7:	// ifnonnull
            {
                int branch_offset = (*(char*)bc << 8) + bc[1];
                unsigned target_bc_index = (curr_bc + branch_offset) - first_bc;
                emit_cmp_zero(emitter,stack,bytecode);
                if (branch_offset < 0) {
                    //
                    // backward branches do not require patching
                    //
                    unsigned target_offset = map[target_bc_index].offset;
                    int disp = target_offset - emitter.get_offset();
                    emitter.emit_branch(cc_ne,disp,0);
                } else {
                    //
                    // emit a branch with 32-bit offset that is later patched
                    //
                    emitter.emit_branch32(cc_ne,&Imm_Opnd(0),0);
                    //
                    // create a patch entry
                    //
                    unsigned patch_offset = emitter.get_offset() - 4;
                    Branch_Patch *p = new(mem_manager) 
                        Branch_Patch(code_patch_list,patch_offset,map[target_bc_index].patch);
                    code_patch_list = map[target_bc_index].patch = p;
                }
                bc += 2;
                break;
            }
        case 0xc8:						// goto_w
            {
                stack.home_all(); // make sure all operands are spilled at the end of bb
                int branch_offset = ((bc[0]<<24) + (bc[1]<<16) + (bc[2]<<8) + bc[3]);
                unsigned target_bc_index = (curr_bc + branch_offset) - first_bc;
                if (branch_offset <= 0) {
                    //
                    // backward branches do not require patching
                    //
                    int disp = map[target_bc_index].offset - emitter.get_offset();
                    emitter.emit_jump(disp);
                } else {
                    //
                    // emit a branch with 32-bit offset that is later patched
                    //
                    emitter.emit_jump32(&Imm_Opnd(0));
                    //
                    // create a patch entry
                    //
                    unsigned patch_offset = emitter.get_offset() - 4;
                    Branch_Patch *p = new(mem_manager) 
                        Branch_Patch(code_patch_list,patch_offset,map[target_bc_index].patch);
                    code_patch_list = map[target_bc_index].patch = p;
                }
                bc += 4;
                break;
            }
        default:
            return JIT_FAILURE;
            break;
        } // switch
        
        if (VMresFailed)
            break;  // while loop
        //
        // update branch patches for this instruction, if any
        //
        unsigned target_offset = map[bc_index].offset;
        while (patch_list != NULL) {
            Branch_Patch *next = patch_list->next_branch;
            patch_list->target_offset = target_offset;
            patch_list = next;
        }
    } // while
    if (VMresFailed) {
        return JIT_FAILURE;
    }
    //
    // inserting code to call recompilation
    //
    insert_call_recompilation(mem_manager,emitter,code_patch_list,prof_patch,
        method_get_JIT_id(compilation_handle),method_handle,
        method_info,prepass.recomp_entries);
    //
    // successful compilation!
    // commit the emitted code to its final resting place
    //
    unsigned code_size = emitter.get_size();
    // Add in size for out-of-line patches.
    unsigned old_size = code_size;
    {
        Runtime_Throw_Info *runtime_throws = method_info->runtime_throws;
        while(runtime_throws) {
            runtime_throws = runtime_throws->next;
            code_size += 10;    // push, jmp
			if(inner_statistics){ //add, adc , for statistics
				code_size += 7 ;
				if(sizeof(PROF_COUNTER) == sizeof(int64))
					code_size += 7 ;
			}
        }
    }
    //
    // allocate the code block and the ro and rw data blocks
    //
    char *code_block = NULL;
    if (code_mi != cm_gen_method) {
        code_block = (char *)method_allocate_code_block(method_handle,
            method_get_JIT_id(compilation_handle),
            code_size);
#if 0
        cout << "\t" << ken_total_gen << "\t" << code_size << endl;
        ken_total_gen++;
        ken_total_gen_code += code_size;
        ken_total_gen_const_data += (prepass.ro_data_size() + prepass.rw_data_size());
#endif // 0
        char *ro_data_block =
            (char *)method_allocate_data_block(method_handle,
            method_get_JIT_id(compilation_handle),
            prepass.ro_data_size() + prepass.rw_data_size());
        char *rw_data_block = ro_data_block + prepass.ro_data_size();
        //
        // copy ro_data, rw_data, and code to ro_data_block, rw_data_block, and code_block
        // the data blocks must be done before the code block because copying of data blocks
        // causes the labels to be patched into the code emitter's code buffers.
        //
        ro_data_emitter.copy(ro_data_block);
        rw_data_emitter.copy(rw_data_block);
        emitter.copy(code_block);
		
		//
        // apply the switch table entry patches
        // switch table entries in the ro_data_block are filled in with final addresses
        // in the code block
        //
        Table_Entry_Patch *p = table_entry_patch_list;
        while (p != NULL) {
            p->apply(code_block,ro_data_block);
            p = p->next();
        }

    }
    //
    // we sort call sites so that unwindstackframe can use binary search to find
    // the call site info.
    //
    // JMS -- This is an optimization that is hopefully correct.
    // In addition, it makes it so that only true call sites are
    // stored in the final method_info, so it will be easier to
    // give the list of GC safe points to the Intel ORP.
    // Also note that qsort() was operating on the uninitialized
    // ret_IP fields of the bogus entries, so it's possible that
    // one of them could "accidentally" match that of a real entry,
    // possibly causing horrible GC-related bugs.
	if(method_info->num_call_sites < method_info->cnt){
		assert(0);
	}
    method_info->num_call_sites = method_info->cnt;
    // JMS -- end of optimization
    unsigned n_call_sites = method_info->num_call_sites;
    Call_Site_Info **sorted_call_sites = NULL;
    if (n_call_sites > 0) 
        sorted_call_sites = (Call_Site_Info**)mem_manager.alloc(
        n_call_sites * sizeof(Call_Site_Info*));
    Call_Site_Info *mi_cs = method_info->cs_info;
    // initialization before sorting
    unsigned i;
    for (i = 0; i < n_call_sites; i++)
        sorted_call_sites[i] = &mi_cs[i];
    
    void qsort(Call_Site_Info *A[], int p, int r);
    if (n_call_sites > 1) 
        qsort(sorted_call_sites, 0, n_call_sites-1);
    //
    // to make garbage collection work for references on the stack, info is gathered
    // in prepass about the types of values on the stack at each garbage collectable
    // site (invoke*, *new*). transfer this information into the old method info block
    // for copying to the new method info block
    //
    extern int binary_search_call_site(Call_Site_Info *cs_info[], 
        int num_call_sites, unsigned call_IP);
    Call_BV_List_Element *bvle;
    bool last_was_invokeinterface = FALSE;
    while((bvle = (Call_BV_List_Element*)prepass.gc_site_vectors->pop()) != NULL) {
        const unsigned char *call_site_bc = bvle->id;
        // find call site that corresponds to this info
        unsigned off = map[call_site_bc - first_bc].offset;
        
        int csi_idx = binary_search_call_site(sorted_call_sites,method_info->num_call_sites, off);
        //
        // This can happen when we perform optimizations.  For instance, for an array
        // access with index = 0, we don't need to check the array bound.
        //
        if (csi_idx == -1) continue;
        Call_Site_Info *csi = sorted_call_sites[csi_idx];
        // for invokeinterface, there are two call sites: (1) helper (2) actual invoke
        // the helper will be second on the bit_vector list and in the array. so, the
        // actual invoke call site comes through and works just fine. the helper call
        // site needs to be put into the next cs_info[] entry to avoid overwriting
        // the already recorded info for the actual invoke site. Be careful here when
        // any changes are made, such as changing how things are placed on the Bit_Vector
        // list, or changing the above search to a binary search.
        if (*bvle->id == 0xb9 /*invokeinterface*/) {
            // if the previous invokeinterface bit_vector was the first of its pair
            if (last_was_invokeinterface) {
                //
                // we found the first entry so we increment csi to get to the 2nd one
                //
                if (csi_idx == 0 || sorted_call_sites[csi_idx-1]->call_IP != off)
                    csi = sorted_call_sites[csi_idx+1];
                last_was_invokeinterface = FALSE;
            } else {
                //
                // we found the 2nd entry so we decrement csi to get to the 1st one
                //
                if (csi_idx != 0 && sorted_call_sites[csi_idx-1]->call_IP == off)
                    csi = sorted_call_sites[csi_idx-1];
                last_was_invokeinterface = TRUE;
            }
        }
        csi->bc = call_site_bc;
        
        Bit_Vector *bv = bvle->elem;
        for (i = 0; i < bv->numbits(); i++) {
            if (bv->is_set(i)) {
                SET_TV_STACK(csi, i);
            }
        }
        csi->stack_depth = bv->numbits();
#ifdef LOCAL_CALLEE
        csi->stack_ref_in_local_callee = bvle->stack_ref_in_local_callee;
#endif
    }
    
    for (Recomp_Entry *re = prepass.recomp_entries; re != NULL; re = re->next) {
        
        int csi_idx = binary_search_call_site(sorted_call_sites,
            method_info->num_call_sites, 
            re->IP_offset);
        assert(csi_idx != -1);
        Call_Site_Info *csi = sorted_call_sites[csi_idx];
        
        Bit_Vector *bv = re->ref_bv;
        for (unsigned i = 0; i < bv->numbits(); i++) {
            if (bv->is_set(i)) {
                SET_TV_STACK(csi, i);
            }
        }
        csi->stack_depth = bv->numbits();
    }
    
    if(code_block != NULL && method_info->runtime_throws) {
        char *throws = code_block + old_size;
        Runtime_Throw_Info *runtime_throws = method_info->runtime_throws;
        while(runtime_throws) {
            // We add 5, but the branch instruction is 6 bytes long, so return_addr
            // points to the middle of the branch instruction.  The exception
            // handling code of the ORP assumes that the exception is thrown with
            // a call instruction and subtracts 5 bytes before processing the
            // exception throw.
            unsigned return_addr = (unsigned)(code_block + (runtime_throws->offset + 5));
            unsigned try_index = runtime_throws->try_index;
            void *handler_addr = orp_get_rt_support_addr(ORP_RT_IDX_OUT_OF_BOUNDS);
            char *mrt_start = throws;
            
			//
			// Instrument counters after ireturn, for statistics
			//
			if(inner_statistics){
				unsigned cnt_addr = (unsigned)&((PROF_COUNTER*)&((unsigned short*)&emitter.prof_rec->back_edge[emitter.prof_rec->n_back_edge])[emitter.prof_rec->n_back_edge])[emitter.inner_bb_cnt_offset++] ;
				assert(emitter.offset_buf) ;
				assert(emitter.offset_buf_offset < 9997) ;
				unsigned buf_len = emitter.prof_rec->n_back_edge * (unsigned)INNER_BRANCH > MIN_INNER_BRANCH_SIZE ? 
								emitter.prof_rec->n_back_edge * (unsigned)INNER_BRANCH : MIN_INNER_BRANCH_SIZE ;
				assert(emitter.inner_bb_cnt_offset < buf_len-1) ;
				emitter.offset_buf[emitter.offset_buf_offset*2] = (unsigned)throws ;
				emitter.offset_buf[emitter.offset_buf_offset*2+1] = (unsigned)cnt_addr ;
				emitter.offset_buf_offset++ ;

				//add 1 -> [cnt_addr]
				*throws++ = '\x83' ;
				*throws++ = '\x05' ;
				*throws++ = cnt_addr & 0xff ;
				*throws++ = (cnt_addr >> 8) & 0xff;
				*throws++ = (cnt_addr >> 16) & 0xff;
				*throws++ = (cnt_addr >> 24) & 0xff;
				*throws++ = '\x01' ;

				if (sizeof(PROF_COUNTER) == sizeof(int64)){
					//adc 0 -> [cnt_addr+4]
					cnt_addr += 4 ;
					*throws++ = '\x83' ;
					*throws++ = '\x15' ;
					*throws++ = cnt_addr & 0xff ;
					*throws++ = (cnt_addr >> 8) & 0xff;
					*throws++ = (cnt_addr >> 16) & 0xff;
					*throws++ = (cnt_addr >> 24) & 0xff;
					*throws++ = '\x00' ;
				}//if sizeof
			}//if inner_statistics

            // push return_address
            *throws++ = '\x68';
            *throws++ = return_addr & 0xff;
            *throws++ = (return_addr >> 8) & 0xff;
            *throws++ = (return_addr >> 16) & 0xff;
            *throws++ = (return_addr >> 24) & 0xff;
            
            unsigned handler_offset = ((unsigned)handler_addr) - ((unsigned)throws + 5);
            // jmp handler
            *throws++ = '\xe9';
            *throws++ = handler_offset & 0xff;
            *throws++ = (handler_offset >> 8) & 0xff;
            *throws++ = (handler_offset >> 16) & 0xff;
            *throws++ = (handler_offset >> 24) & 0xff;
            
            char *mrt_patch = code_block + (runtime_throws->offset + 2);
            unsigned mrt_offset = ((unsigned)mrt_start) - ((unsigned)mrt_patch + 4);
            *mrt_patch++ = mrt_offset & 0xff;
            *mrt_patch++ = (mrt_offset >> 8) & 0xff;
            *mrt_patch++ = (mrt_offset >> 16) & 0xff;
            *mrt_patch++ = (mrt_offset >> 24) & 0xff;
            
            runtime_throws = runtime_throws->next;
        }
    }
    
    //
    // create small method info
    //
    if (smi == NULL) {
        assert(num_get_put_static == prepass.num_get_put_static);
        //
        // Small_Method_Info has 32 bits reserved (class_initializer[1]).
        // Here we compute how many extra words are needed.
        // 
        unsigned bv_wd = num_get_put_static/(sizeof(unsigned)*8);
        unsigned sz = sizeof(Small_Method_Info) + sizeof(unsigned)*bv_wd;
        smi = (Small_Method_Info *)method_allocate_info_block(method_handle,
              method_get_JIT_id(compilation_handle),sz);
        smi->mi = NULL;
        smi->prof_rec = prof_rec;
        //
        // record class_initializer bit vector
        //
        for (unsigned i = 0; i < bv_wd+1; i++)
            smi->class_initializer[i] = class_initializer[i];
#if 0
        ken_total_gen_info += sz;
#endif // 0
    }
    //
    // commit the method_info block
    //
    if (code_mi != cm_gen_code) {
        unsigned mi_size = gen_method_info_size(method_info, regalloc);
        Jit_Method_Info *new_mi = gen_perm_method_info(method_info,
            method_handle,
            compilation_handle,
            mi_size,
            (code_block != NULL) ? code_block : m_code,
            regalloc,
            sorted_call_sites);
        //
        // the assertion was removed because multiple threads may generate
        // the same method info at the same time.
        // assert(smi->mi == NULL);
        smi->mi = new_mi;
#if 0
        ken_total_gen_mi++;
        ken_total_gen_data += mi_size;
#endif // 0
    }
    
#if 0
    static unsigned total_code_size = 0;
    static unsigned total_info_size = 0;
    static unsigned method_count = 0;
    total_code_size += code_size;
    total_info_size += mi_size;
    method_count ++;
    cout << class_get_name(class_handle) << "." << method_get_name(method_handle) << ":" << endl;
    cout << "\tcode_size " << code_size << endl;
    cout << "\tinfo_size " << mi_size << endl;
    cout << "\tavg       " << ((double) mi_size) / code_size
        << " info bytes per code byte" << endl;
    cout << "\tCumulative code_size " << total_code_size << endl;
    cout << "\tCumulative info_size " << total_info_size << endl;
    cout << "\tCumulative avg:      " << ((double) total_info_size) / total_code_size << endl;
#endif // 1
    
    if (code_block != NULL) {
        //
        // apply the code patches
        //
        for (Patch *code_patch = code_patch_list;
        code_patch != NULL;
        code_patch = code_patch->next()) {
            code_patch->apply(code_block);
        }
        //
        // update exception handler info
        //
        unsigned cEH = method_get_num_handlers(method_handle);
        method_set_num_target_handlers(method_handle,
            method_get_JIT_id(compilation_handle),
            cEH);
        for(unsigned e=0; e<cEH; e++) {
            unsigned tryBegOffsPtr, tryEndOffsPtr, handlerOffsPtr, handlerTypePtr;
            method_get_handler_info(method_handle,e,
                &tryBegOffsPtr,&tryEndOffsPtr,&handlerOffsPtr,&handlerTypePtr);
            tryBegOffsPtr  = (unsigned)(map[tryBegOffsPtr].offset);
            tryEndOffsPtr  = (unsigned)(map[tryEndOffsPtr].offset);
            unsigned old_handler = handlerOffsPtr;
            handlerOffsPtr = (unsigned)(map[handlerOffsPtr].offset);
#if 0
            method_set_target_handler_info(method_handle,e,
                code_block+tryBegOffsPtr,code_block+tryEndOffsPtr,
                code_block+handlerOffsPtr,handlerTypePtr,
                (first_bc[old_handler] == 0x57 /* pop */));
#else
            Class_Handle exc_ch;
            if(handlerTypePtr) {
                Loader_Exception ld_exc;
                exc_ch = resolve_class(compilation_handle,
                    method_get_class(method_handle),
                    handlerTypePtr,
                    &ld_exc);
                assert(exc_ch);
            } else {
                exc_ch = 0;
            }
            method_set_target_handler_info(method_handle,
                method_get_JIT_id(compilation_handle),
                e,
                code_block + tryBegOffsPtr,
                code_block + tryEndOffsPtr,
                code_block + handlerOffsPtr,
                exc_ch,
                (first_bc[old_handler] == 0x57 /* pop */));
#endif
        }
        
#ifdef DUMP_JIT
        //---------------------
        
        if (L1a_do_dumpjit)
        {
            L1a_dump_jit(method_handle, first_bc, code_len, map, prepass,
                method_name, code_block, code_size);
        }
        //---------------------
#endif

		//
		// For O1 statistics
		//
		if(code_block && statistics && (emitter.offset_buf_offset > 0)){
			if ( emitter_offset_fp == NULL)
				emitter_offset_fp = fopen( "o1_dumpjit.txt", "w+");

			assert(emitter_offset_fp) ;
//			fprintf( emitter_offset_fp, "code_block_addr: %p code_block_size: %4d\n", code_block, code_size);			
			for ( unsigned i = 0; i < emitter.offset_buf_offset; i++){
				unsigned cnt_addr;
				cnt_addr = offset_buf[i*2+1];
/*
				if ( i == 0){
					fprintf( emitter_offset_fp, "code_block_addr: %p code_block_size: %4d ", code_block, code_size);					
					fprintf( emitter_offset_fp, "BB_offset: %p BB_size: %4d counter_addr: %p count: %I64u\n", 0, offset_buf[i*2], cnt_addr - 8, *(uint64*)cnt_addr);
				}

				if ( i < emitter.offset_buf_offset - 1){
					fprintf( emitter_offset_fp, "code_block_addr: %p code_block_size: %4d ", code_block, code_size);					
					fprintf( emitter_offset_fp, "BB_offset: %p BB_size: %4d counter_addr: %p count: %I64u\n", offset_buf[i*2], offset_buf[i*2+2] - offset_buf[i*2], cnt_addr, *(uint64*)cnt_addr);
				}else{
					fprintf( emitter_offset_fp, "code_block_addr: %p code_block_size: %4d ", code_block, code_size);					
					fprintf( emitter_offset_fp, "BB_offset: %p BB_size: %4d counter_addr: %p count: %I64u\n", offset_buf[i*2], code_size - offset_buf[i*2], cnt_addr, *(uint64*)cnt_addr);
				}
*/
			}
            emitter_native_code(method_handle, first_bc, code_len, map, prepass,
                method_name, code_block, code_size);
//			fprintf( emitter_offset_fp, "code_size%d\n", code_size);
			fflush( emitter_offset_fp);
			//
			// reset the offset_buf!
			//
			if(inner_statistics)
				((PROF_COUNTER*)&((unsigned short*)&emitter.prof_rec->back_edge[emitter.prof_rec->n_back_edge])[emitter.prof_rec->n_back_edge])[emitter.inner_bb_cnt_offset] = __UINT64_C(0xFFFFFFFFFFFFFFFF) ;
			emitter.offset_buf = NULL ;
			emitter.offset_buf_offset = 0 ;
			//emitter.prof_rec = NULL ;
		}


    }
    
#ifdef STACK_TRACE_ADDITIONAL_INFO

	unsigned lnlen = method_get_line_number_table_size(method_handle);
	for (i = 0; i < lnlen ; i++) {
		unsigned start_pc, line_number;
		method_get_line_number(method_handle, i, &start_pc, &line_number);
		method_set_nativecode_offset_for_line(method_handle, i, map[start_pc].offset);
    }

#endif

#ifdef VTune_Support
#ifdef _DEBUG
	{   
	char buf[512];
	int lineNo=0;  lineInfo_t *lineInfo=NULL;
	settle_classpath();
	char *source_filename =
		java_sourcefile(buf,sizeof(buf),(char*)class_name,
		(char*)method_name,code_len,&lineNo,&lineInfo);
/*
	cout << " --- " << class_name << " -> " << source_filename << endl;
	cout << "     " << method_name << " len = " << code_len << endl;
	for (int i=0; i<lineNo; i++)
		cout << "    line: " << lineInfo[i].line_number <<
		": " << lineInfo[i].start_pc << endl;
*/
	free(lineInfo);
    }
#endif  // _DEBUG
    if (code_block != NULL && VTuneModeFlags) {
        char buf[512], *source_filename = NULL;
        int lineNo=0;  lineInfo_t *lineInfo=NULL;
        if (java_sourcefile(buf,sizeof(buf),(char*)class_name,
            (char*)method_name,code_len,&lineNo,&lineInfo
            )) {
            source_filename = strdup(buf);
//			cout << class_name << " -> " << source_filename << "\n";
        }
        mInfo->method_id = (unsigned long)method_handle;
        mInfo->method_name = (char*)method_name;
        mInfo->method_load_address = (unsigned long)code_block;
        mInfo->method_size = code_size;
        mInfo->class_id = (unsigned long)class_handle;
        mInfo->class_file_name = (char*)class_name;
        mInfo->source_file_name = source_filename;
        
        mInfo->line_number_size = lineNo;
        LineNumberInfo *lines = (mInfo->line_number_size==0)? NULL :
        (LineNumberInfo *)malloc(sizeof(LineNumberInfo) *
            ((int)mInfo->line_number_size));
        // We should keep the memory needed for lines until we notify the
        // profiler.
        

        mInfo->line_number_table = lines;
        
        LineNumberInfo *pl = lines;
        for (unsigned long i=0; i<mInfo->line_number_size; i++) {
            unsigned start_pc, line_number;
            start_pc = lineInfo[i].start_pc;
            line_number = lineInfo[i].line_number;
            pl->LineNumber = line_number;
            pl->Offset = map[start_pc].offset;
            pl++;
        }
        free(lineInfo);

    }
#endif // VTune_Support
    
    return JIT_SUCCESS; // for now!
}

static int partition(Call_Site_Info *A[], int p, int r) {
    Call_Site_Info *x = A[(p+r)>>1];
    int i = p - 1;
    int j = r + 1;
    while (true) {
        do {j = j - 1;} while (A[j]->ret_IP > x->ret_IP);
        do {i = i + 1;} while (A[i]->ret_IP < x->ret_IP);
        if ( i < j) {
            Call_Site_Info *tmp;
            tmp = A[i];	A[i] = A[j]; A[j] = tmp;
        } else
            return j;
    }
}
void qsort(Call_Site_Info *A[], int p, int r) {
    if (p < r) {
        int q = partition(A,p,r);
        qsort(A,p,q);
        qsort(A,q+1,r);
    }
}
//
// use binary search to find the corresponding call site info 
//
int binary_search_call_site(Call_Site_Info *cs_info[], 
                            int num_call_sites,unsigned call_IP) {
    int low = 0, up = num_call_sites - 1;
    while (low <= up) {
        unsigned mid = (low + up) / 2;
        if (cs_info[mid]->call_IP > call_IP) 
            up = mid - 1;
        else if (cs_info[mid]->call_IP == call_IP)
            return mid;
        else  // bvle->id <  call_bc
            low = mid + 1;
    }
    return -1;
}
