Fixes for non-returning functions and tail calls:
[dyninst.git] / parseAPI / src / Parser.C
1 /*
2  * See the dyninst/COPYRIGHT file for copyright information.
3  * 
4  * We provide the Paradyn Tools (below described as "Paradyn")
5  * on an AS IS basis, and do not warrant its validity or performance.
6  * We reserve the right to update, modify, or discontinue this
7  * software at any time.  We shall have no obligation to supply such
8  * updates or modifications or any other form of support to you.
9  * 
10  * By your use of Paradyn, you understand and agree that we (or any
11  * other person or entity with proprietary rights in Paradyn) are
12  * under no obligation to provide either maintenance services,
13  * update services, notices of latent defects, or correction of
14  * defects for Paradyn.
15  * 
16  * This library is free software; you can redistribute it and/or
17  * modify it under the terms of the GNU Lesser General Public
18  * License as published by the Free Software Foundation; either
19  * version 2.1 of the License, or (at your option) any later version.
20  * 
21  * This library is distributed in the hope that it will be useful,
22  * but WITHOUT ANY WARRANTY; without even the implied warranty of
23  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
24  * Lesser General Public License for more details.
25  * 
26  * You should have received a copy of the GNU Lesser General Public
27  * License along with this library; if not, write to the Free Software
28  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
29  */
30
31 #include <vector>
32 #include <limits>
33
34 #include "dyntypes.h"
35
36 #include "CodeObject.h"
37 #include "CFGFactory.h"
38 #include "ParseCallback.h"
39 #include "Parser.h"
40 #include "CFG.h"
41 #include "util.h"
42 #include "debug_parse.h"
43
44 #include <boost/tuple/tuple.hpp>
45
46 using namespace std;
47 using namespace Dyninst;
48 using namespace Dyninst::ParseAPI;
49 using namespace Dyninst::InstructionAPI;
50
51 typedef std::pair< Address, EdgeTypeEnum > edge_pair_t;
52 typedef vector< edge_pair_t > Edges_t;
53
54
55 namespace {
56     struct less_cr {
57      bool operator()(CodeRegion * x, CodeRegion * y) 
58      { 
59          return x->offset() < y->offset();
60      }
61     };
62 }
63
64 Parser::Parser(CodeObject & obj, CFGFactory & fact, ParseCallbackManager & pcb) :
65     _obj(obj),
66     _cfgfact(fact),
67     _pcb(pcb),
68     _parse_data(NULL),
69     num_delayedFrames(0),
70     _sink(NULL),
71     _parse_state(UNPARSED),
72     _in_parse(false),
73     _in_finalize(false)
74 {
75     // cache plt entries for fast lookup
76     const map<Address, string> & lm = obj.cs()->linkage();
77     map<Address, string>::const_iterator lit = lm.begin();
78     for( ; lit != lm.end(); ++lit) {
79         parsing_printf("Cached PLT entry %s (%lx)\n",lit->second.c_str(),lit->first);
80         plt_entries[lit->first] = lit->second;
81     }
82
83     if(obj.cs()->regions().empty()) {
84         parsing_printf("[%s:%d] CodeSource provides no CodeRegions"
85                        " -- unparesable\n",
86             FILE__,__LINE__);
87         _parse_state = UNPARSEABLE;
88         return;
89     }
90
91     // check whether regions overlap
92     vector<CodeRegion *> const& regs = obj.cs()->regions();
93     vector<CodeRegion *> copy(regs.begin(),regs.end());
94     sort(copy.begin(),copy.end(),less_cr());
95
96     // allocate a sink block -- region is arbitrary
97     _sink = _cfgfact._mksink(&_obj,copy[0]);
98
99     bool overlap = false;
100     CodeRegion * prev = copy[0], *cur = NULL;
101     for(unsigned i=1;i<copy.size();++i) {
102         cur = copy[i];
103         if(cur->offset() < prev->offset() + prev->length()) {
104             parsing_printf("Overlapping code regions [%lx,%lx) and [%lx,%lx)\n",
105                 prev->offset(),prev->offset()+prev->length(),
106                 cur->offset(),cur->offset()+cur->length());
107             overlap = true;
108             break;
109         }
110     }
111
112     if(overlap)
113         _parse_data = new OverlappingParseData(this,copy);
114     else
115         _parse_data = new StandardParseData(this);
116 }
117
118 ParseFrame::~ParseFrame()
119 {
120     cleanup(); // error'd frames still need cleanup
121 }
122
123 Parser::~Parser()
124 {
125     if(_parse_data)
126         delete _parse_data;
127
128     vector<ParseFrame *>::iterator fit = frames.begin();
129     for( ; fit != frames.end(); ++fit) 
130         delete *fit;
131     frames.clear();
132 }
133
134 void
135 Parser::add_hint(Function * f)
136 {
137     if(!_parse_data->findFunc(f->region(),f->addr()))
138         record_func(f);
139 }
140
141 void
142 Parser::parse()
143 {
144     parsing_printf("[%s:%d] parse() called on Parser %p with state %d\n",
145                    FILE__,__LINE__,this, _parse_state);
146
147     // For modification: once we've full-parsed once, don't do it again
148     if (_parse_state >= COMPLETE) return;
149
150     if(_parse_state == UNPARSEABLE)
151         return;
152
153     assert(!_in_parse);
154     _in_parse = true;
155
156     parse_vanilla();
157
158     // anything else by default...?
159
160     if(_parse_state < COMPLETE)
161         _parse_state = COMPLETE;
162     
163     _in_parse = false;
164     parsing_printf("[%s:%d] parsing complete for Parser %p with state %d\n", FILE__, __LINE__, this, _parse_state);
165 }
166
167 void
168 Parser::parse_at(
169     CodeRegion * region,
170     Address target,
171     bool recursive,
172     FuncSource src)
173 {
174     Function *f;
175     ParseFrame *pf;
176     vector<ParseFrame *> work;
177
178     parsing_printf("[%s:%d] entered parse_at([%lx,%lx),%lx)\n",
179         FILE__,__LINE__,region->low(),region->high(),target);
180
181     if(!region->contains(target)) {
182         parsing_printf("\tbad address, bailing\n");
183         return;
184     }
185
186     if(_parse_state < PARTIAL)
187         _parse_state = PARTIAL;
188
189     f = _parse_data->get_func(region,target,src);
190     if(!f) {
191         parsing_printf("   could not create function at %lx\n",target);
192         return;
193     }
194
195     ParseFrame::Status exist = _parse_data->frameStatus(region,target);
196     if(exist != ParseFrame::BAD_LOOKUP && exist != ParseFrame::UNPARSED) {
197         parsing_printf("   function at %lx already parsed, status %d\n",
198             target, exist);
199         return;
200     }
201
202     if(!(pf = _parse_data->findFrame(region,target))) {
203         pf = new ParseFrame(f,_parse_data);
204         init_frame(*pf);
205         frames.push_back(pf);
206         _parse_data->record_frame(pf);
207     } 
208
209     work.push_back(pf);
210     parse_frames(work,recursive);
211
212     // downgrade state if necessary
213     if(_parse_state > COMPLETE)
214         _parse_state = COMPLETE;
215
216 }
217
218 void
219 Parser::parse_at(Address target, bool recursive, FuncSource src)
220 {
221     CodeRegion * region = NULL;
222
223     parsing_printf("[%s:%d] entered parse_at(%lx)\n",FILE__,__LINE__,target);
224
225     if(_parse_state == UNPARSEABLE)
226         return;
227
228     StandardParseData * spd = dynamic_cast<StandardParseData *>(_parse_data);
229     if(!spd) {
230         parsing_printf("   parse_at is invalid on overlapping regions\n");
231         return;
232     }
233
234     region = spd->reglookup(region,target); // input region ignored for SPD
235     if(!region) {
236         parsing_printf("   failed region lookup at %lx\n",target);
237         return;
238     }
239
240     parse_at(region,target,recursive,src);
241 }
242
243 void
244 Parser::parse_vanilla()
245 {
246     ParseFrame *pf;
247     vector<ParseFrame *> work;
248     vector<Function *>::iterator fit;
249
250     parsing_printf("[%s:%d] entered parse_vanilla()\n",FILE__,__LINE__);
251     parsing_printf("\t%d function hints\n",hint_funcs.size());
252
253     if(_parse_state < PARTIAL)
254         _parse_state = PARTIAL;
255     else
256         parsing_printf("\tparse state is %d, some parsing already done\n",
257             _parse_state);
258
259     /* Initialize parse frames from hints */
260     for(fit=hint_funcs.begin();fit!=hint_funcs.end();++fit) {
261         Function * hf = *fit;
262         ParseFrame::Status test = frame_status(hf->region(),hf->addr());
263         if(test != ParseFrame::BAD_LOOKUP &&
264            test != ParseFrame::UNPARSED)
265         {
266             parsing_printf("\tskipping repeat parse of %lx [%s]\n",
267                 hf->addr(),hf->name().c_str());
268             continue;
269         }
270
271         pf = new ParseFrame(hf,_parse_data);
272         init_frame(*pf);
273         frames.push_back(pf);
274         work.push_back(pf);
275         _parse_data->record_frame(pf);
276     }
277
278     parse_frames(work,true);
279 }
280
281 void
282 Parser::parse_edges( vector< ParseWorkElem * > & work_elems )
283 {
284     if(_parse_state == UNPARSEABLE)
285         return;
286
287     // build up set of needed parse frames and load them with work elements
288     set<ParseFrame*> frameset; // for dup checking
289     vector<ParseFrame*> frames;
290
291     for (unsigned idx=0; idx < work_elems.size(); idx++) {
292
293         ParseWorkElem *elem = work_elems[idx];
294         Block *src = elem->edge()->src();
295
296         if (elem->order() == ParseWorkElem::call_fallthrough)
297         {
298             Edge *callEdge = NULL;
299             Block::edgelist trgs = src->targets();
300             for (Block::edgelist::iterator eit = trgs.begin(); 
301                  eit != trgs.end(); 
302                  eit++) 
303             {
304                 if ((*eit)->type() == CALL) {
305                     callEdge = *eit;
306                     if (!(*eit)->sinkEdge()) // if it's a sink edge, look for nonsink CALL
307                         break; 
308                 }
309             }
310             // create a call work elem so that the bundle is complete
311             // and set the target function's return status and 
312             // tamper to RETURN and TAMPER_NONE, respectively
313             //assert(callEdge);
314
315             // When we have a direct call to unallocated memory or garbage
316             // we there is no call edge when the call is first executed. I'm
317             // not sure why not, so I'm attempting to continue past here without
318             // fixing up the called function...
319             // Also, in the case I saw, the callee was _not_ returning. Not directly. It 
320             // led into a big case with a longjmp() equivalent. 
321
322             if (callEdge) 
323             {
324                 bool isResolvable = false;
325                 Address callTarget = 0;
326                 if ( ! callEdge->sinkEdge() ) 
327                 {
328                     isResolvable = true;
329                     callTarget = callEdge->trg()->start();
330                     // the call target may be in another Code Object
331                     Function *callee = callEdge->trg()->obj()->findFuncByEntry(
332                         callEdge->trg()->region(), callTarget);
333                     assert(callee);
334                     callee->set_retstatus(RETURN);
335                     callee->_tamper = TAMPER_NONE;
336                 }
337                 elem->bundle()->add(new ParseWorkElem
338                     ( elem->bundle(), 
339                     callEdge,
340                     callTarget,
341                     isResolvable,
342                     false ));
343             }
344         }
345         ParseFrame *frame = _parse_data->findFrame
346             ( src->region(), 
347               src->lastInsnAddr() );
348         bool isNewFrame = false;
349         if (!frame) {
350             vector<Function*> funcs;
351             src->getFuncs(funcs);
352             frame = new ParseFrame(*funcs.begin(),_parse_data);
353             for (unsigned fix=1; fix < funcs.size(); fix++) {
354                 // if the block is shared, all of its funcs need
355                 // to add the new edge
356                 funcs[fix]->_cache_valid = false;
357             }
358             isNewFrame = true;
359         }
360
361         // push before frame init so no seed is added
362         if (elem->bundle()) {
363             frame->work_bundles.push_back(elem->bundle());
364         }
365         frame->pushWork(elem);
366         if (isNewFrame) {
367             init_frame(*frame);
368         }
369
370         if (frameset.end() == frameset.find(frame)) {
371             frameset.insert(frame);
372             frames.push_back(frame);
373         }
374     }
375
376     // now parse
377     if(_parse_state < PARTIAL)
378         _parse_state = PARTIAL;
379     _in_parse = true;
380
381     parse_frames( frames, true );
382
383     if(_parse_state > COMPLETE)
384         _parse_state = COMPLETE;
385     _in_parse = false;
386
387     finalize();
388
389 }
390
391 void
392 Parser::parse_frames(vector<ParseFrame *> & work, bool recursive)
393 {
394     ParseFrame * pf;
395
396     /* Recursive traversal parsing */ 
397     while(!work.empty()) {
398         
399         pf = work.back();
400         work.pop_back();
401
402         if(pf->status() == ParseFrame::PARSED)
403             continue;
404
405         parse_frame(*pf,recursive);
406         switch(pf->status()) {
407             case ParseFrame::CALL_BLOCKED: {
408                 parsing_printf("[%s] frame %lx blocked at %lx\n",
409                     FILE__,pf->func->addr(),pf->curAddr);
410
411                 assert(pf->call_target);        
412
413                 parsing_printf("    call target %lx\n",pf->call_target->addr());
414                 work.push_back(pf);
415     
416                 CodeRegion * cr = pf->call_target->region();
417                 Address targ = pf->call_target->addr();
418
419                 ParseFrame * tf = _parse_data->findFrame(cr,targ);
420                 if(!tf)
421                 {
422                     // sanity
423                     if(_parse_data->frameStatus(cr,targ) == ParseFrame::PARSED)
424                         assert(0);
425
426                     tf = new ParseFrame(pf->call_target,_parse_data);
427                     init_frame(*tf);
428                     frames.push_back(tf);
429                     _parse_data->record_frame(tf);
430                 }
431                 if(likely(recursive))
432                     work.push_back(tf);
433                 else {
434                     assert(0);
435                     // XXX should never get here
436                     //parsing_printf("    recursive parsing disabled\n");
437                 }
438
439                 break;
440             }
441             case ParseFrame::PARSED:
442                 parsing_printf("[%s] frame %lx complete, return status: %d\n",
443                     FILE__,pf->func->addr(),pf->func->_rs);
444
445                 if (unlikely( _obj.defensiveMode() && 
446                               TAMPER_NONE != pf->func->tampersStack() &&
447                               TAMPER_NONZERO != pf->func->tampersStack() ))
448                 {   // may adjust CALL_FT targets, add a frame to the worklist,
449                     // or trigger parsing in a separate target CodeObject
450                     tamper_post_processing(work,pf);
451                 }
452                 
453                 /* add waiting frames back onto the worklist */
454                 resumeFrames(pf->func, work);
455                 
456                 pf->cleanup();
457                 break;
458             case ParseFrame::FRAME_ERROR:
459                 parsing_printf("[%s] frame %lx error at %lx\n",
460                     FILE__,pf->func->addr(),pf->curAddr);
461                 break;
462             case ParseFrame::FRAME_DELAYED: {
463                 parsing_printf("[%s] frame %lx delayed at %lx\n",
464                         FILE__,
465                         pf->func->addr(),
466                        pf->curAddr);
467                 
468                 if (pf->delayedWork.size()) {
469                     // Add frame to global delayed list
470                     map<ParseWorkElem *, Function *>::iterator iter;
471                     map<Function *, set<ParseFrame *> >::iterator fIter;
472                     for (iter = pf->delayedWork.begin();
473                             iter != pf->delayedWork.end();
474                             ++iter) {
475
476                         Function * ct = iter->second;
477                         parsing_printf("[%s] waiting on %s\n",
478                                 __FILE__,
479                                 ct->name().c_str());
480
481                         fIter = delayedFrames.find(ct);
482                         if (fIter == delayedFrames.end()) {
483                             std::set<ParseFrame *> waiters;
484                             waiters.insert(pf);
485                             delayedFrames[ct] = waiters;
486                         } else {
487                             delayedFrames[ct].insert(pf);
488                         }
489                     }
490                 } else {
491                     // We shouldn't get here
492                     assert(0 && "Delayed frame with no delayed work");
493                 }
494                 
495                 /* if the return status of this function has been updated, add
496                  * waiting frames back onto the work list */
497                 resumeFrames(pf->func, work);
498                 break;
499             }
500             default:
501                 assert(0 && "invalid parse frame status");
502         }
503     }
504
505     // Use fixed-point to ensure we parse frames whose parsing had to be delayed
506     if (delayedFrames.size() == 0) {
507         // Done!
508         parsing_printf("[%s] Fixed point reached (0 funcs with unknown return status)\n)",
509                 __FILE__);
510         num_delayedFrames = 0;
511     } else if (delayedFrames.size() == num_delayedFrames) {
512         // If we've reached a fixedpoint and have remaining frames, we must
513         // have a cyclic dependency
514         parsing_printf("[%s] Fixed point reached (%d funcs with unknown return status)\n", 
515                 __FILE__, 
516                 delayedFrames.size());
517
518         // Mark UNSET functions in cycle as NORETURN
519         // except if we're doing non-recursive parsing.
520         // If we're just parsing one function, we want
521         // to mark everything RETURN instead.
522         map<Function *, set<ParseFrame *> >::iterator iter;
523         vector<Function *> updated;
524         for (iter = delayedFrames.begin(); iter != delayedFrames.end(); ++iter) {
525             Function * func = iter->first;
526             if (func->retstatus() == UNSET) {
527               if(recursive) 
528               {
529                 func->set_retstatus(NORETURN);
530                 func->obj()->cs()->incrementCounter(PARSE_NORETURN_HEURISTIC);
531               }
532               else
533               {
534                 func->set_retstatus(RETURN);
535               }
536               updated.push_back(func);
537             } 
538
539             set<ParseFrame *> vec = iter->second;
540             set<ParseFrame *>::iterator vIter;
541             for (vIter = vec.begin(); vIter != vec.end(); ++vIter) {
542                 Function * delayed = (*vIter)->func;
543                 if (delayed->retstatus() == UNSET) {
544                     delayed->set_retstatus(NORETURN);
545                     delayed->obj()->cs()->incrementCounter(PARSE_NORETURN_HEURISTIC);
546                     updated.push_back(func);
547                 }
548             }
549         }
550
551         // We should have updated the return status of one or more frames; recurse
552         if (updated.size()) {
553             for (vector<Function *>::iterator uIter = updated.begin();
554                     uIter != updated.end();
555                     ++uIter) {
556                 resumeFrames((*uIter), work);
557             }
558
559             if (work.size()) {
560                 parsing_printf("[%s] Updated retstatus of delayed frames, trying again; work.size() = %d\n",
561                         __FILE__,
562                         work.size());
563                 parse_frames(work, recursive);
564             }
565         } else {
566             // We shouldn't get here
567             parsing_printf("[%s] No more work can be done (%d funcs with unknown return status)\n",
568                     __FILE__, 
569                     delayedFrames.size());
570             assert(0);
571         }
572     } else {
573         // We haven't yet reached a fixedpoint; let's recurse
574         parsing_printf("[%s] Fixed point not yet reached (%d funcs with unknown return status)\n", 
575                 __FILE__,
576                 delayedFrames.size());
577
578         // Update num_delayedFrames for next iteration 
579         num_delayedFrames = delayedFrames.size();
580
581         // Check if we can resume any frames yet
582         vector<Function *> updated;
583         for (map<Function *, set<ParseFrame *> >::iterator iter = delayedFrames.begin(); 
584                 iter != delayedFrames.end(); 
585                 ++iter) {
586             if (iter->first->_rs != UNSET) {
587                 updated.push_back(iter->first);
588             }
589         }
590
591         if (updated.size()) {
592             for (vector<Function *>::iterator uIter = updated.begin();
593                     uIter != updated.end();
594                     ++uIter) {
595                 resumeFrames((*uIter), work);
596             }
597         }
598
599         // Recurse through parse_frames        
600         parsing_printf("[%s] Calling parse_frames again, work.size() = %d\n", 
601                 __FILE__,
602                 work.size());
603         parse_frames(work, recursive);
604     }
605
606     for(unsigned i=0;i<frames.size();++i) {
607         _parse_data->remove_frame(frames[i]);
608         delete frames[i];
609     }
610     frames.clear();
611 }
612
613 /* Finalizing all functions for consumption:
614   
615    - Finish delayed parsing
616    - Prepare and record FuncExtents for range-based lookup
617 */
618
619 // Could set cache_valid depending on whether the function is currently
620 // being parsed somewhere. 
621
622 void
623 Parser::finalize(Function *f)
624 {
625     if(f->_cache_valid) {
626         return;
627     }
628
629     if(!f->_parsed) {
630         parsing_printf("[%s:%d] Parser::finalize(f[%lx]) "
631                        "forced parsing\n",
632             FILE__,__LINE__,f->addr());
633         parse();
634     }
635
636         bool cache_value = true;
637         /* this is commented out to prevent a failure in tampersStack, but
638            this may be an incorrect approach to fixing the problem.
639         if(frame_status(f->region(), f->addr()) < ParseFrame::PARSED) {
640                 // XXX prevent caching of blocks, extents for functions that
641                 // are actively being parsed. This prevents callbacks and other
642                 // functions called from within, e.g. parse_frame from setting
643                 // the caching flag and preventing later updates to the blocks()
644                 // vector during finalization.
645                 cache_value = false;
646         }*/
647
648     parsing_printf("[%s] finalizing %s (%lx)\n",
649         FILE__,f->name().c_str(),f->addr());
650
651     region_data * rd = _parse_data->findRegion(f->region());
652     assert(rd);
653
654     // finish delayed parsing and sorting
655     Function::blocklist blocks = f->blocks_int();
656
657     // is this the first time we've parsed this function?
658     if (unlikely( !f->_extents.empty() )) {
659         _parse_data->remove_extents(f->_extents);
660         f->_extents.clear();
661     }
662     
663
664     if(blocks.empty()) {
665         f->_cache_valid = cache_value; // see above
666         return;
667     }
668     
669     auto bit = blocks.begin();
670     FuncExtent * ext = NULL;
671     Address ext_s = (*bit)->start();
672     Address ext_e = ext_s;
673
674     for( ; bit != blocks.end(); ++bit) {
675         Block * b = *bit;
676         if(b->start() > ext_e) {
677             ext = new FuncExtent(f,ext_s,ext_e);
678             parsing_printf("%lx extent [%lx,%lx)\n",f->addr(),ext_s,ext_e);
679             f->_extents.push_back(ext);
680             rd->funcsByRange.insert(ext);
681             ext_s = b->start();
682         }
683         ext_e = b->end();
684     }
685     ext = new FuncExtent(f,ext_s,ext_e);
686     parsing_printf("%lx extent [%lx,%lx)\n",f->addr(),ext_s,ext_e);
687     rd->funcsByRange.insert(ext);
688     f->_extents.push_back(ext);
689
690     f->_cache_valid = cache_value; // see comment at function entry
691
692     if (unlikely( f->obj()->defensiveMode())) {
693         // add fallthrough edges for calls assumed not to be returning
694         // whose fallthrough blocks we parsed anyway (this happens if 
695         // the FT block is also a branch target)
696         Function::edgelist & edges = f->_call_edge_list;
697         for (Function::edgelist::iterator eit = edges.begin();
698              eit != edges.end(); 
699              eit++)
700         {
701             if (2 > (*eit)->src()->targets().size()) {
702                 Block *ft = _parse_data->findBlock((*eit)->src()->region(),
703                                                    (*eit)->src()->end());
704                 if (ft && HASHDEF(f->_bmap,ft->start())) {
705                     link((*eit)->src(),ft,CALL_FT,false);
706                 }
707             }
708         }
709     }
710 }
711
712 void
713 Parser::finalize()
714 {
715     assert(!_in_finalize);
716     _in_finalize = true;
717
718     if(_parse_state < FINALIZED) {
719         finalize_funcs(hint_funcs);
720         finalize_funcs(discover_funcs);
721         _parse_state = FINALIZED;
722     }
723
724     _in_finalize = false;
725 }
726
727 void
728 Parser::finalize_funcs(vector<Function *> &funcs)
729 {
730     vector<Function *>::iterator fit = funcs.begin();
731     for( ; fit != funcs.end(); ++fit) {
732         finalize(*fit);
733     } 
734 }
735
736 void
737 Parser::record_func(Function *f) {
738     if(!f) return;
739
740     if(f->src() == HINT)
741         hint_funcs.push_back(f);
742     else
743         discover_funcs.push_back(f);
744
745     sorted_funcs.insert(f);
746
747     _parse_data->record_func(f);
748 }
749
750 void
751 Parser::init_frame(ParseFrame & frame)
752 {
753    Block * b = NULL;
754     Block * split = NULL;
755
756     if ( ! frame.func->_entry ) 
757     {
758         // Find or create a block
759         b = block_at(frame.func, frame.func->addr(),split);
760         if(b) {
761             frame.leadersToBlock[frame.func->addr()] = b;
762             frame.func->_entry = b;
763             frame.seed = new ParseWorkElem(NULL,NULL,frame.func->addr(),true,false);
764             frame.pushWork(frame.seed);
765         } else {
766             parsing_printf("[%s] failed to initialize parsing frame\n",
767                 FILE__);
768             return;
769         }
770         if (split) {
771             _pcb.splitBlock(split,b);
772         }
773     }
774
775     // FIXME these operations should move into the actual parsing
776     Address ia_start = frame.func->addr();
777     unsigned size = 
778      frame.codereg->offset() + frame.codereg->length() - ia_start;
779     const unsigned char* bufferBegin =
780      (const unsigned char *)(frame.func->isrc()->getPtrToInstruction(ia_start));
781     InstructionDecoder dec(bufferBegin,size,frame.codereg->getArch());
782     InstructionAdapter_t ah(dec, ia_start, frame.func->obj(),
783         frame.codereg, frame.func->isrc(), b);
784         if(ah.isStackFramePreamble()) {
785         frame.func->_no_stack_frame = false;
786         }
787     frame.func->_saves_fp = ah.savesFP();
788 }
789
790 void ParseFrame::cleanup()
791 {
792     for(unsigned i=0;i<work_bundles.size();++i)
793         delete work_bundles[i];
794     work_bundles.clear();
795     if(seed)
796         delete seed;
797     seed = NULL;
798 }
799
800 namespace {
801     inline Edge * bundle_call_edge(ParseWorkBundle * b)
802     {
803         if(!b) return NULL;
804
805         vector<ParseWorkElem*> const& elems = b->elems();
806         vector<ParseWorkElem*>::const_iterator it = elems.begin();
807         for( ; it != elems.end(); ++it) {
808             if((*it)->edge()->type() == CALL)
809                 return (*it)->edge();
810         }
811         return NULL;
812     }
813
814     /* 
815      * Look up the next block for detection of straight-line
816      * fallthrough edges into existing blocks.
817      */
818     inline std::pair<Address, Block*> get_next_block(
819         Address addr,
820         CodeRegion * codereg, 
821         ParseData * _parse_data)
822     {
823         Block * nextBlock = NULL;
824         Address nextBlockAddr;
825
826         nextBlockAddr = numeric_limits<Address>::max();
827         region_data * rd = _parse_data->findRegion(codereg);
828
829         if((nextBlock = rd->blocksByRange.successor(addr)) &&
830            nextBlock->start() > addr)
831         {
832             nextBlockAddr = nextBlock->start();   
833         }
834
835         return std::pair<Address,Block*>(nextBlockAddr,nextBlock);
836     }
837 }
838
839 void
840 Parser::parse_frame(ParseFrame & frame, bool recursive) {
841     /** Persistent intermediate state **/
842     boost::shared_ptr<InstructionAdapter_t> ahPtr;
843     ParseFrame::worklist_t & worklist = frame.worklist;
844     dyn_hash_map<Address, Block *> & leadersToBlock = frame.leadersToBlock;
845     Address & curAddr = frame.curAddr;
846     Function * func = frame.func;
847     dyn_hash_map<Address, bool> & visited = frame.visited;
848     unsigned & num_insns = frame.num_insns;
849     func->_cache_valid = false;
850
851     /** Non-persistent intermediate state **/
852     Address nextBlockAddr;
853     Block * nextBlock;
854
855     if (frame.status() == ParseFrame::UNPARSED) {
856         parsing_printf("[%s] ==== starting to parse frame %lx ====\n",
857             FILE__,frame.func->addr());
858         // prevents recursion of parsing
859         frame.func->_parsed = true;
860     } else {
861         parsing_printf("[%s] ==== resuming parse of frame %lx ====\n",
862             FILE__,frame.func->addr());
863         // Pull work that can be resumed off the delayedWork list
864         std::map<ParseWorkElem *, Function *>::iterator iter;
865         
866         vector<ParseWorkElem *> updated;
867         vector<ParseWorkElem *>::iterator uIter;
868
869         for (iter = frame.delayedWork.begin();
870                 iter != frame.delayedWork.end();
871                 ++iter) {
872             if (iter->second->_rs != UNSET) {
873                 frame.pushWork(iter->first);
874                 updated.push_back(iter->first);
875             }
876         }
877         for (uIter = updated.begin(); uIter != updated.end(); ++uIter) {
878             frame.delayedWork.erase(*uIter);
879         }
880     }
881
882
883     frame.set_status(ParseFrame::PROGRESS);
884
885     while(!worklist.empty()) {
886         
887         Block * cur = NULL;
888         ParseWorkElem * work = frame.popWork();
889         if (work->order() == ParseWorkElem::call) {
890             Function * ct = NULL;
891             Edge * ce = NULL;
892
893             if (!work->callproc()) {
894               // If we're not doing recursive traversal, skip *all* of the call edge processing.
895               // We don't want to create a stub. We'll handle the assumption of a fallthrough
896               // target for the fallthrough work element, not the call work element. Catch blocks
897               // will get excluded but that's okay; our heuristic is not reliable enough that I'm willing to deploy
898               // it when we'd only be detecting a non-returning callee by name anyway.
899               // --BW 12/2012
900                                 if (!recursive) {
901                                     parsing_printf("[%s] non-recursive parse skipping call %lx->%lx\n",
902                                                                    FILE__, work->edge()->src()->lastInsnAddr(), work->target());
903                                         continue;
904                                 }
905               
906                 parsing_printf("[%s] binding call %lx->%lx\n",
907                                FILE__,work->edge()->src()->lastInsnAddr(),work->target());
908             
909                 pair<Function*,Edge*> ctp =
910                     bind_call(frame,
911                               work->target(),
912                               work->edge()->src(),
913                               work->edge());
914                 ct = ctp.first;
915                 ce = ctp.second;
916
917                 work->mark_call();
918             } else {
919                 ct = _parse_data->findFunc(frame.codereg,work->target());
920                 ce = work->edge();
921             }
922
923             if (recursive && ct &&
924                (frame_status(ct->region(),ct->addr())==ParseFrame::UNPARSED || 
925                 frame_status(ct->region(),ct->addr())==ParseFrame::BAD_LOOKUP)) {
926                 // suspend this frame and parse the next
927                 parsing_printf("    [suspend frame %lx]\n", func->addr()); 
928                 frame.call_target = ct;
929                 frame.set_status(ParseFrame::CALL_BLOCKED);
930                 // need to re-visit this edge
931                 frame.pushWork(work);
932                 return;
933             } else if (ct && work->tailcall()) {
934                 // XXX The target has been or is currently being parsed (else
935                 //     the previous conditional would have been taken),
936                 //     so if its return status is unset then this
937                 //     function has to take UNKNOWN
938                 if (func->_rs != RETURN) {
939                     if (ct->_rs > NORETURN)
940                       func->set_retstatus(ct->_rs);
941                     else if (ct->_rs == UNSET)
942                       func->set_retstatus(UNKNOWN);
943                 }
944             }
945
946             // check for catch blocks after non-returning calls
947             if (ct && ct->_rs == NORETURN) {
948                 Address catchStart;
949                 Block * caller = ce->src();
950                 if (frame.codereg->findCatchBlock(caller->end(),catchStart)) {
951                     parsing_printf("[%s] found post-return catch block %lx\n",
952                         FILE__,catchStart);
953
954                     // make an edge
955                     Edge * catch_edge = link_tempsink(caller,CATCH);
956                 
957                     // push on worklist
958                     frame.pushWork(
959                         frame.mkWork(
960                             work->bundle(),
961                             catch_edge,
962                             catchStart,
963                             true,
964                             false));
965                 }
966             }
967     
968             continue;
969         } else if (work->order() == ParseWorkElem::call_fallthrough) {
970             // check associated call edge's return status
971             Edge * ce = bundle_call_edge(work->bundle());
972             if (!ce) {
973                 // odd; no call edge in this bundle
974                 parsing_printf("[%s] unexpected missing call edge at %lx\n",
975                         FILE__,work->edge()->src()->lastInsnAddr());
976             } else {
977                 // check for system call FT
978                 Edge* edge = work->edge();
979                 Block::Insns blockInsns;
980                 edge->src()->getInsns(blockInsns);
981                 auto prev = blockInsns.rbegin();
982                 InstructionAPI::InstructionPtr prevInsn = prev->second;
983                 if (prevInsn->getOperation().getID() == e_syscall) {
984                     Address src = edge->src()->lastInsnAddr();
985
986                     bool is_nonret = false;
987
988                     // Need to determine if system call is non-returning
989                     long int syscallNumber;
990                     if (!getSyscallNumber(func, edge->src(), src, frame.codereg->getArch(), syscallNumber)) {
991                         // If we cannot retrieve a syscall number, assume the syscall returns
992                         parsing_printf("[%s] could not retrieve syscall edge, assuming returns\n", FILE__);
993                     } else {
994                         if (obj().cs()->nonReturningSyscall(syscallNumber)) {
995                             is_nonret = true;
996                         }
997                     }
998
999                     if (is_nonret) {
1000                         parsing_printf("[%s] no fallthrough for non-returning syscall\n",
1001                                 FILE__,
1002                                 work->edge()->src()->lastInsnAddr());
1003
1004                         // unlink tempsink fallthrough edge
1005                         Edge * remove = work->edge();
1006                         remove->src()->removeTarget(remove);
1007                         factory().destroy_edge(remove);
1008                         continue;
1009                     }
1010                 } else {
1011                     Address target = ce->trg()->start();
1012                     Function * ct = _parse_data->findFunc(frame.codereg,target);
1013                     bool is_plt = false;
1014                     bool is_nonret = false;
1015
1016                     // check if associated call edge's return status is still unknown
1017                     if (ct && (ct->_rs == UNSET) ) {
1018                         // Delay parsing until we've finished the corresponding call edge
1019                         parsing_printf("[%s] Parsing FT edge %lx, corresponding callee (%s) return status unknown; delaying work\n",
1020                                 __FILE__,
1021                                 work->edge()->src()->lastInsnAddr(),
1022                                 ct->name().c_str());
1023
1024                         // Add work to ParseFrame's delayed list
1025                         frame.pushDelayedWork(work, ct);
1026
1027                         // Continue other work for this frame
1028                         continue; 
1029                     }
1030
1031                     is_plt = HASHDEF(plt_entries,target);
1032
1033                     // CodeSource-defined tests 
1034                     is_nonret = obj().cs()->nonReturning(target);
1035                     if (is_nonret) {
1036                         parsing_printf("\t Disallowing FT edge: CodeSource reports nonreturning\n");
1037                     }
1038                     if (!is_nonret && is_plt) {
1039                         is_nonret |= obj().cs()->nonReturning(plt_entries[target]);
1040                         if (is_nonret) {
1041                             parsing_printf("\t Disallowing FT edge: CodeSource reports PLT nonreturning\n");
1042                         }
1043                     }
1044                     // Parsed return status tests
1045                     if (!is_nonret && !is_plt && ct) {
1046                         is_nonret |= (ct->retstatus() == NORETURN);
1047                         if (is_nonret) {
1048                             parsing_printf("\t Disallowing FT edge: function is non-returning\n");
1049                         }
1050                     }
1051                     // Call-stack tampering tests
1052                     if (unlikely(!is_nonret && frame.func->obj()->defensiveMode() && ct)) {
1053                         is_nonret |= (ct->retstatus() == UNKNOWN);
1054                         if (is_nonret) {
1055                             parsing_printf("\t Disallowing FT edge: function in "
1056                                     "defensive binary may not return\n");
1057                             mal_printf("Disallowing FT edge: function %lx in "
1058                                     "defensive binary may not return\n", ct->addr());
1059                         } else {
1060                             StackTamper ct_tamper = ct->tampersStack();
1061                             is_nonret |= (TAMPER_NONZERO == ct_tamper);
1062                             is_nonret |= (TAMPER_ABS == ct_tamper);
1063                             if (is_nonret) {
1064                                 mal_printf("Disallowing FT edge: function at %lx "
1065                                         "tampers with its stack\n", ct->addr());
1066                                 parsing_printf("\t Disallowing FT edge: function "
1067                                         "tampers with its stack\n");
1068                             }
1069                         }
1070                     }
1071                     if (is_nonret) {
1072                         parsing_printf("[%s] no fallthrough for non-returning call "
1073                                 "to %lx at %lx\n",FILE__,target,
1074                                 work->edge()->src()->lastInsnAddr());
1075
1076                         // unlink tempsink fallthrough edge
1077                         Edge * remove = work->edge();
1078                         remove->src()->removeTarget(remove);
1079                         factory().destroy_edge(remove);
1080                         continue;
1081                     }
1082
1083                     // Invalidate cache_valid for all sharing functions
1084                     invalidateContainingFuncs(func, ce->src());                
1085                 }
1086             }
1087         } else if (work->order() == ParseWorkElem::seed_addr) {
1088             cur = leadersToBlock[work->target()];
1089         } else if (work->order() == ParseWorkElem::resolve_jump_table) {
1090             // resume to resolve jump table 
1091             parsing_printf("... continue parse indirect jump at %lx\n", work->ah()->getAddr());
1092             Block *nextBlock = work->cur();
1093             if (nextBlock->last() != work->ah()->getAddr()) {
1094                 // The block has been split
1095                 region_data * rd = _parse_data->findRegion(frame.codereg);
1096                 set<Block*> blocks;
1097                 rd->blocksByRange.find(work->ah()->getAddr(), blocks);
1098                 for (auto bit = blocks.begin(); bit != blocks.end(); ++bit) {
1099                     if ((*bit)->last() == work->ah()->getAddr()) {
1100                         nextBlock = *bit;
1101                         break;
1102                     }
1103                 }
1104
1105             }
1106             ProcessCFInsn(frame,nextBlock,*work->ah());
1107             continue;
1108         }
1109         // call fallthrough case where we have already checked that
1110         // the target returns. this is used in defensive mode.
1111         else if (work->order() == ParseWorkElem::checked_call_ft) {
1112             Edge* ce = bundle_call_edge(work->bundle());
1113             if (ce != NULL) {
1114                 invalidateContainingFuncs(func, ce->src());
1115             } else {
1116                 parsing_printf("[%s] unexpected missing call edge at %lx\n",
1117                         FILE__,work->edge()->src()->lastInsnAddr());
1118             }
1119         }
1120         
1121         if (NULL == cur) {
1122             pair<Block*,Edge*> newedge =
1123                 add_edge(frame,
1124                          frame.func,
1125                          work->edge()->src(),
1126                          work->target(),
1127                          work->edge()->type(),
1128                          work->edge());
1129             cur = newedge.first;
1130         }
1131
1132         if (HASHDEF(visited,cur->start()))
1133         {
1134             parsing_printf("[%s] skipping locally parsed target at %lx\n",
1135                 FILE__,work->target());
1136             continue;
1137         } 
1138         visited[cur->start()] = true;
1139         leadersToBlock[cur->start()] = cur;
1140
1141         if (!cur->_parsed)
1142         {
1143             parsing_printf("[%s] parsing block %lx\n",
1144                 FILE__,cur->start());
1145             if (frame.func->obj()->defensiveMode()) {
1146                 mal_printf("new block at %lx (0x%lx)\n",cur->start(), cur);
1147             }
1148
1149             cur->_parsed = true;
1150             curAddr = cur->start();
1151         } else {
1152             parsing_printf("[%s] deferring parse of shared block %lx\n",
1153                 FILE__,cur->start());
1154             if (func->_rs < UNKNOWN) {
1155                 // we've parsed into another function, if we've parsed
1156                 // into it's entry point, set retstatus to match it
1157                 Function * other_func = _parse_data->findFunc(
1158                     func->region(), cur->start());
1159                 if (other_func && other_func->retstatus() > UNKNOWN) {
1160                   func->set_retstatus(other_func->retstatus());
1161                 } else {
1162                   func->set_retstatus(UNKNOWN);
1163                 }
1164             }
1165             continue;
1166         }
1167
1168         /*
1169          * External consumers of parsing may have invoked finalizing
1170          * methods from callback routines. Ensure that the caches
1171          * are invalidated because a new block is being added to the
1172          * function's view
1173          */
1174         func->_cache_valid = false;
1175
1176         // NB Using block's region() here because it may differ from the
1177         //    function's if control flow has jumped to another region
1178         unsigned size = 
1179             cur->region()->offset() + cur->region()->length() - curAddr;
1180         const unsigned char* bufferBegin =
1181           (const unsigned char *)(func->isrc()->getPtrToInstruction(curAddr));
1182         InstructionDecoder dec(bufferBegin,size,frame.codereg->getArch());
1183
1184         if (!ahPtr)
1185             ahPtr.reset(new InstructionAdapter_t(dec, curAddr, func->obj(), 
1186                         cur->region(), func->isrc(), cur));
1187         else
1188             ahPtr->reset(dec,curAddr,func->obj(),
1189                          cur->region(), func->isrc(), cur);
1190        
1191         InstructionAdapter_t & ah = *ahPtr; 
1192
1193         using boost::tuples::tie;
1194         tie(nextBlockAddr,nextBlock) = get_next_block(
1195             frame.curAddr, frame.codereg, _parse_data);
1196
1197         bool isNopBlock = ah.isNop();
1198
1199         while(true) {
1200             curAddr = ah.getAddr();
1201             /** Check for straight-line fallthrough **/
1202             if (curAddr == nextBlockAddr) {
1203                 parsing_printf("[%s] straight-line parse into block at %lx\n",
1204                     FILE__,curAddr);
1205                 if (frame.func->obj()->defensiveMode()) {
1206                     mal_printf("straight-line parse into block at %lx\n",curAddr);
1207                 }
1208                 ah.retreat();
1209                 curAddr = ah.getAddr();
1210
1211                 end_block(cur,ah);
1212                 pair<Block*,Edge*> newedge =
1213                     add_edge(frame,frame.func,cur,
1214                              nextBlockAddr,FALLTHROUGH,NULL);
1215         
1216                 if (!HASHDEF(visited,nextBlockAddr) &&
1217                    !HASHDEF(leadersToBlock,nextBlockAddr)) {
1218                     parsing_printf("[%s:%d] pushing %lx onto worklist\n",
1219                                    FILE__,__LINE__,nextBlockAddr);
1220
1221                     frame.pushWork(
1222                         frame.mkWork(
1223                             NULL,
1224                             newedge.second,
1225                             nextBlockAddr,
1226                             true,
1227                             false)
1228                     );
1229                     /* preserved as example of leaky code; use mkWork instead
1230                     frame.pushWork(
1231                         new ParseWorkElem(
1232                             NULL,
1233                             newedge.second,
1234                             nextBlockAddr,
1235                             true,
1236                             false)
1237                         );
1238                     */
1239                     leadersToBlock[nextBlockAddr] = nextBlock;
1240                 }
1241                 break;
1242             } else if (curAddr > nextBlockAddr) {
1243                 parsing_printf("[%s:%d] inconsistent instruction stream: "
1244                                "%lx is within [%lx,%lx)\n",
1245                     FILE__,__LINE__,curAddr,
1246                     nextBlock->start(),nextBlock->end());
1247
1248                 // NB "cur" hasn't ended, so its range may
1249                 // not look like it overlaps with nextBlock
1250                 _pcb.overlapping_blocks(cur,nextBlock);
1251
1252                 tie(nextBlockAddr,nextBlock) = 
1253                     get_next_block(frame.curAddr, frame.codereg, _parse_data);
1254             }
1255
1256             // per-instruction callback notification 
1257             ParseCallback::insn_details insn_det;
1258             insn_det.insn = &ah;
1259              
1260                 parsing_printf("[%s:%d] curAddr 0x%lx \n",
1261                     FILE__,__LINE__,curAddr);
1262
1263             if (func->_is_leaf_function) {
1264                 Address ret_addr;
1265                 func->_is_leaf_function = !(insn_det.insn->isReturnAddrSave(ret_addr));
1266                     parsing_printf("[%s:%d] leaf %d funcname %s \n",
1267                         FILE__,__LINE__,func->_is_leaf_function, func->name().c_str());
1268                 if (!func->_is_leaf_function) func->_ret_addr = ret_addr;       
1269             }
1270                 
1271             _pcb.instruction_cb(func,cur,curAddr,&insn_det);
1272
1273             if (isNopBlock && !ah.isNop()) {
1274                 ah.retreat();
1275
1276                 end_block(cur,ah);
1277                 pair<Block*,Edge*> newedge =
1278                     add_edge(frame,frame.func,cur,curAddr,FALLTHROUGH,NULL);
1279                 Block * targ = newedge.first;
1280   
1281                 parsing_printf("[%s:%d] nop-block ended at %lx\n",
1282                     FILE__,__LINE__,curAddr); 
1283                 if (targ && !HASHDEF(visited,targ->start())) {
1284                     parsing_printf("[%s:%d] pushing %lx onto worklist\n",
1285                         FILE__,__LINE__,targ->start());
1286
1287                     frame.pushWork(
1288                         frame.mkWork(
1289                             NULL,
1290                             newedge.second,
1291                             targ->start(),
1292                             true,
1293                             false)
1294                         );
1295                     leadersToBlock[targ->start()] = targ; 
1296                 }
1297                 break;
1298             }
1299             
1300             /** Particular instruction handling (calls, branches, etc) **/
1301             ++num_insns; 
1302
1303             if(ah.hasCFT()) {
1304 //             if (false) {
1305                if (ah.isIndirectJump()) {
1306                    // Create a work element to represent that
1307                    // we will resolve the jump table later
1308                    end_block(cur,ah);
1309                    frame.pushWork( frame.mkWork( work->bundle(), cur, ah));
1310                } else {
1311                    ProcessCFInsn(frame,cur,ah);
1312                }
1313                break;
1314             } else if (func->_saves_fp &&
1315                        func->_no_stack_frame &&
1316                        ah.isFrameSetupInsn()) { // isframeSetup is expensive
1317                func->_no_stack_frame = false;
1318             } else if (ah.isLeave()) {
1319                 func->_no_stack_frame = false;
1320             } else if ( ah.isAbort() ) {
1321                 // 4. `abort-causing' instructions
1322                 end_block(cur,ah);
1323                 //link(cur, _sink, DIRECT, true);
1324                 break; 
1325             } else if ( ah.isInvalidInsn() ) {
1326                 // 4. Invalid or `abort-causing' instructions
1327                 end_block(cur,ah);
1328                 link(cur, _sink, DIRECT, true);
1329                 break; 
1330             } else if ( ah.isInterruptOrSyscall() ) {
1331                 // 5. Raising instructions
1332                 end_block(cur,ah);
1333
1334                 pair<Block*,Edge*> newedge =
1335                     add_edge(frame,frame.func,cur,ah.getNextAddr(),FALLTHROUGH,NULL);
1336                 Block * targ = newedge.first;
1337    
1338                 if (targ && !HASHDEF(visited,targ->start()) &&
1339                             !HASHDEF(leadersToBlock,targ->start())) {
1340                     parsing_printf("[%s:%d] pushing %lx onto worklist\n",
1341                                    FILE__,__LINE__,targ->start());
1342
1343                     frame.pushWork(
1344                         frame.mkWork(
1345                             NULL,
1346                             newedge.second,
1347                             targ->start(),
1348                             true,
1349                             false)
1350                         );
1351                     leadersToBlock[targ->start()] = targ; 
1352                 }
1353                 if (unlikely(func->obj()->defensiveMode())) {
1354                     fprintf(stderr,"parsed bluepill insn sysenter or syscall "
1355                             "in defensive mode at %lx\n",curAddr);
1356                 }
1357                 break;
1358             } else if (unlikely(func->obj()->defensiveMode())) {
1359                 if (!_pcb.hasWeirdInsns(func) && ah.isGarbageInsn()) {
1360                     // add instrumentation at this addr so we can
1361                     // extend the function if this really executes
1362                     ParseCallback::default_details det(
1363                         (unsigned char*) cur->region()->getPtrToInstruction(cur->lastInsnAddr()),
1364                         cur->end() - cur->lastInsnAddr(),
1365                         true);
1366                     _pcb.abruptEnd_cf(cur->lastInsnAddr(),cur,&det);
1367                     _pcb.foundWeirdInsns(func);
1368                     end_block(cur,ah);
1369                     // allow invalid instructions to end up as a sink node.
1370                     link(cur, _sink, DIRECT, true);
1371                     break;
1372                 } else if (ah.isNopJump()) {
1373                     // patch the jump to make it a nop, and re-set the 
1374                     // instruction adapter so we parse the instruction
1375                     // as a no-op this time, allowing the subsequent
1376                     // instruction to be parsed correctly
1377                     mal_printf("Nop jump at %lx, changing it to nop\n",ah.getAddr());
1378                     _pcb.patch_nop_jump(ah.getAddr());
1379                     unsigned bufsize = 
1380                         func->region()->offset() + func->region()->length() - ah.getAddr();
1381                     const unsigned char* bufferBegin = (const unsigned char *)
1382                         (func->isrc()->getPtrToInstruction(ah.getAddr()));
1383                     dec = InstructionDecoder
1384                         (bufferBegin, bufsize, frame.codereg->getArch());
1385                     ah = InstructionAdapter_t(dec, curAddr, func->obj(), 
1386                                               func->region(), func->isrc(), cur);
1387                 } else {
1388                     entryID id = ah.getInstruction()->getOperation().getID();
1389                     switch (id) {
1390                         case e_rdtsc:
1391                             fprintf(stderr,"parsed bluepill insn rdtsc at %lx\n",curAddr);
1392                             break;
1393                         case e_sldt:
1394                             fprintf(stderr,"parsed bluepill insn sldt at %lx\n",curAddr);
1395                             break;
1396                         default:
1397                             break;
1398                     }
1399                 }
1400             } else {
1401                 // default
1402             }
1403
1404             /** Check for overruns of valid address space **/
1405             if (!is_code(func,ah.getNextAddr())) {
1406                 parsing_printf("[%s] next insn %lx is invalid\n",
1407                                FILE__,ah.getNextAddr());
1408
1409                 end_block(cur,ah);
1410                 // We need to tag the block with a sink edge
1411                 link(cur, _sink, DIRECT, true);
1412                 break;
1413             } else if (!cur->region()->contains(ah.getNextAddr())) {
1414                 parsing_printf("[%s] next address %lx is outside [%lx,%lx)\n",
1415                     FILE__,ah.getNextAddr(),
1416                     cur->region()->offset(),
1417                     cur->region()->offset()+cur->region()->length());
1418                 end_block(cur,ah);
1419                 // We need to tag the block with a sink edge
1420                 link(cur, _sink, DIRECT, true);
1421                 break;
1422             }
1423             ah.advance();
1424         }
1425     }
1426
1427     // Check if parsing is complete
1428     if (!frame.delayedWork.empty()) {
1429         frame.set_status(ParseFrame::FRAME_DELAYED);
1430         return; 
1431     }
1432
1433     /** parsing complete **/
1434     if (HASHDEF(plt_entries,frame.func->addr())) {
1435 //        if (obj().cs()->nonReturning(frame.func->addr())) {
1436         if (obj().cs()->nonReturning(plt_entries[frame.func->addr()])) {        
1437             frame.func->set_retstatus(NORETURN);
1438         } else {
1439             frame.func->set_retstatus(UNKNOWN);
1440         }
1441
1442         // Convenience -- adopt PLT name
1443         frame.func->_name = plt_entries[frame.func->addr()];
1444     } else if (frame.func->_rs == UNSET) {
1445         frame.func->set_retstatus(NORETURN);
1446     }
1447
1448     frame.set_status(ParseFrame::PARSED);
1449
1450     if (unlikely(obj().defensiveMode())) {
1451        // calculate this after setting the function to PARSED, so that when
1452        // we finalize the function we'll actually save the results and won't 
1453        // re-finalize it
1454        func->tampersStack(); 
1455        _pcb.newfunction_retstatus( func );
1456     }
1457 }
1458
1459 void
1460 Parser::end_block(Block * b, InstructionAdapter_t & ah)
1461 {
1462     b->_lastInsn = ah.getAddr();
1463     b->updateEnd(ah.getNextAddr());
1464
1465     record_block(b);
1466 }
1467
1468 void
1469 Parser::record_block(Block *b)
1470 {
1471     parsing_printf("[%s:%d] recording block [%lx,%lx)\n",
1472         FILE__,__LINE__,b->start(),b->end());
1473     _parse_data->record_block(b->region(),b);
1474 }
1475
1476
1477 Block *
1478 Parser::block_at(
1479     Function * owner,
1480     Address addr,
1481     Block * & split)
1482 {
1483     set<Block *> overlap;
1484     Block * exist = NULL;
1485     Block * ret = NULL;
1486     Block * inconsistent = NULL;
1487     Address prev_insn = 0;
1488
1489     split = NULL;
1490
1491     CodeRegion *cr;
1492     if(owner->region()->contains(addr))
1493         cr = owner->region();
1494     else
1495         cr = _parse_data->reglookup(owner->region(),addr);
1496
1497     if(!is_code(owner,addr)) {
1498         parsing_printf("[%s] block address %lx rejected by isCode()\n",
1499             FILE__,addr);
1500         return NULL;
1501     }
1502
1503     if(NULL == (exist = _parse_data->findBlock(cr,addr))) {
1504         _parse_data->findBlocks(cr,addr,overlap);
1505         if(overlap.size() > 1)
1506             parsing_printf("[%s] address %lx overlapped by %d blocks\n",
1507                 FILE__,addr,overlap.size());
1508
1509         /* Platform specific consistency test: 
1510            generally checking for whether the address is an
1511            instruction boundary in a block */
1512         for(set<Block *>::iterator sit=overlap.begin();sit!=overlap.end();++sit)
1513         {
1514             Block * b = *sit;
1515
1516             // consistent will fill in prev_insn with the address of the
1517             // instruction preceeding addr if addr is consistent
1518             if(b->consistent(addr,prev_insn)) {
1519                 exist = b;
1520                 break;   
1521             } else {
1522                 parsing_printf("[%s] %lx is inconsistent with [%lx,%lx)\n", 
1523                     FILE__,addr,b->start(),b->end());
1524                 if(inconsistent) {
1525                     parsing_printf("   multiple inconsistent blocks!\n");
1526                 }
1527                 inconsistent = b;
1528             }
1529         }
1530     }
1531
1532     if(exist) {
1533         if(exist->start() == addr) {
1534             parsing_printf("[%s] block %lx exists\n",
1535                 FILE__,addr);
1536             ret = exist;
1537         }
1538         else {
1539             parsing_printf("[%s] address %lx splits [%lx,%lx) (%p)\n",
1540                FILE__,addr,exist->start(),exist->end(),exist);
1541             if (owner->obj()->defensiveMode()) {
1542                 mal_printf("new block at %lx splits [%lx %lx)\n",
1543                            addr, exist->start(), exist->end());
1544             }
1545             split = exist;
1546             ret = split_block(owner,exist,addr,prev_insn);
1547         }
1548     } else {
1549         ret = factory()._mkblock(owner,cr,addr);
1550         record_block(ret);
1551         _pcb.addBlock(owner, ret);
1552     }
1553
1554     if(unlikely(inconsistent)) {
1555        _pcb.overlapping_blocks(ret,inconsistent); 
1556     }
1557
1558     return ret;
1559 }
1560
1561 pair<Block *,Edge *>
1562 Parser::add_edge(
1563     ParseFrame & frame,
1564     Function * owner,
1565     Block * src,
1566     Address dst,
1567     EdgeTypeEnum et,
1568     Edge * exist)
1569 {
1570     Block * split = NULL;
1571     Block * ret = NULL;
1572     Edge * newedge = NULL;
1573     pair<Block *, Edge *> retpair((Block *) NULL, (Edge *) NULL);
1574
1575     if(!is_code(owner,dst)) {
1576         parsing_printf("[%s] target address %lx rejected by isCode()\n",dst);
1577         return retpair;
1578     }
1579
1580     ret = block_at(owner,dst,split);
1581     retpair.first = ret;
1582
1583     if(split == src) {
1584         // special case -- same block
1585         src = ret;
1586     }
1587
1588     if(split && HASHDEF(frame.visited,split->start())) {
1589         // prevent "delayed parsing" of extant block if 
1590         // this frame has already visited it
1591         frame.visited[ret->start()] = true;
1592         frame.leadersToBlock[ret->start()] = ret;
1593     }
1594
1595     if(NULL == exist) {
1596         newedge = link(src,ret,et,false);
1597         retpair.second = newedge;
1598     } else {
1599         relink(exist,src,ret);
1600         retpair.second = exist;
1601     }
1602
1603     return retpair;
1604 }
1605
1606 Block *
1607 Parser::split_block(
1608     Function * owner, 
1609     Block *b, 
1610     Address addr, 
1611     Address previnsn)
1612 {
1613     Block * ret;
1614     CodeRegion * cr;
1615     bool isRetBlock = false;
1616     if(owner->region()->contains(b->start()))
1617         cr = owner->region();
1618     else
1619         cr = _parse_data->reglookup(owner->region(),b->start());
1620     region_data * rd = _parse_data->findRegion(cr);
1621
1622     // enable for extra-safe testing, but callers are responsbible
1623     // assert(b->consistent(addr);
1624
1625     ret = factory()._mkblock(owner,cr,addr);
1626
1627     // move out edges
1628     vector<Edge *> & trgs = b->_trglist;
1629     vector<Edge *>::iterator tit = trgs.begin(); 
1630     for(;tit!=trgs.end();++tit) {
1631         Edge *e = *tit;
1632         e->_source = ret;
1633         ret->_trglist.push_back(e);
1634     }
1635     if (!trgs.empty() && RET == (*trgs.begin())->type()) {
1636        isRetBlock = true;
1637     }
1638     trgs.clear();
1639     ret->updateEnd(b->_end);
1640     ret->_lastInsn = b->_lastInsn;
1641     ret->_parsed = true;
1642     link(b,ret,FALLTHROUGH,false); 
1643
1644     record_block(ret);
1645
1646     // b's range has changed
1647     rd->blocksByRange.remove(b);
1648     b->updateEnd(addr);
1649     b->_lastInsn = previnsn;
1650     rd->blocksByRange.insert(b); 
1651     // Any functions holding b that have already been finalized
1652     // need to have their caches invalidated so that they will
1653     // find out that they have this new 'ret' block
1654     std::set<Function*> prev_owners;
1655     rd->findFuncs(b->start(),prev_owners);
1656     for(std::set<Function*>::iterator oit = prev_owners.begin();
1657         oit != prev_owners.end(); ++oit)
1658     {
1659         Function * po = *oit;
1660         if (po->_cache_valid) {
1661            po->_cache_valid = false;
1662            parsing_printf("[%s:%d] split of [%lx,%lx) invalidates cache of "
1663                    "func at %lx\n",
1664            FILE__,__LINE__,b->start(),b->end(),po->addr());
1665         }
1666         if (isRetBlock) {
1667            po->_retBL.clear(); //could remove b from the vector instead of clearing it, not sure what's cheaper
1668         }
1669     }
1670     // KEVINTODO: study performance impact of this callback
1671     _pcb.splitBlock(b,ret);
1672
1673     return ret;
1674  }
1675
1676 pair<Function *,Edge*>
1677 Parser::bind_call(ParseFrame & frame, Address target, Block * cur, Edge * exist)
1678 {
1679     Function * tfunc = NULL;
1680     Block * tblock = NULL;
1681     FuncSource how = RT;
1682     if(frame.func->_src == GAP || frame.func->_src == GAPRT)
1683         how = GAPRT;
1684
1685     // look it up
1686     tfunc = _parse_data->get_func(frame.codereg,target,how);
1687     if(!tfunc) {
1688         parsing_printf("[%s:%d] can't bind call to %lx\n",
1689             FILE__,__LINE__,target);
1690         return pair<Function*,Edge*>((Function *) NULL,exist);
1691     }
1692
1693     // add an edge
1694     pair<Block*,Edge*> ret = add_edge(frame,tfunc,cur,target,CALL,exist);
1695     tblock = ret.first;
1696     if(!tblock) {
1697         parsing_printf("[%s:%d] can't bind call to %lx\n",
1698             FILE__,__LINE__,target);
1699         return pair<Function*,Edge*>((Function *) NULL,exist);
1700     }
1701
1702     return pair<Function*,Edge*>(tfunc,ret.second);
1703 }
1704
1705 Function *
1706 Parser::findFuncByEntry(CodeRegion *r, Address entry)
1707 {
1708     if(_parse_state < PARTIAL) {
1709         parsing_printf("[%s:%d] Parser::findFuncByEntry([%lx,%lx),%lx) "
1710                        "forced parsing\n",
1711             FILE__,__LINE__,r->low(),r->high(),entry);
1712         parse();
1713     }
1714     return _parse_data->findFunc(r,entry);
1715 }
1716
1717 int 
1718 Parser::findFuncs(CodeRegion *r, Address addr, set<Function *> & funcs)
1719 {
1720     if(_parse_state < COMPLETE) {
1721         parsing_printf("[%s:%d] Parser::findFuncs([%lx,%lx),%lx,...) "
1722                        "forced parsing\n",
1723             FILE__,__LINE__,r->low(),r->high(),addr);
1724         parse();
1725     }
1726     if(_parse_state < FINALIZED) {
1727         parsing_printf("[%s:%d] Parser::findFuncs([%lx,%lx),%lx,...) "
1728                        "forced finalization\n",
1729             FILE__,__LINE__,r->low(),r->high(),addr);
1730         finalize();
1731     }
1732     return _parse_data->findFuncs(r,addr,funcs);
1733 }
1734
1735 int 
1736 Parser::findFuncs(CodeRegion *r, Address start, Address end, set<Function *> & funcs)
1737 {
1738     if(_parse_state < COMPLETE) {
1739         parsing_printf("[%s:%d] Parser::findFuncs([%lx,%lx),%lx,%lx) "
1740                        "forced parsing\n",
1741             FILE__,__LINE__,r->low(),r->high(),start,end);
1742         parse();
1743     }
1744     if(_parse_state < FINALIZED) {
1745         parsing_printf("[%s:%d] Parser::findFuncs([%lx,%lx),%lx,%lx) "
1746                        "forced finalization\n",
1747             FILE__,__LINE__,r->low(),r->high(),start,end);
1748         finalize();
1749     }
1750     return _parse_data->findFuncs(r,start,end,funcs);
1751 }
1752
1753 Block *
1754 Parser::findBlockByEntry(CodeRegion *r, Address entry)
1755 {
1756     if(_parse_state < PARTIAL) {
1757         parsing_printf("[%s:%d] Parser::findBlockByEntry([%lx,%lx),%lx) "
1758                        "forced parsing\n",
1759             FILE__,__LINE__,r->low(),r->high(),entry);
1760         parse();
1761     }
1762     return _parse_data->findBlock(r,entry);
1763 }
1764
1765 Block *
1766 Parser::findNextBlock(CodeRegion *r, Address addr)
1767 {
1768     if(_parse_state < PARTIAL) {
1769         parsing_printf("[%s:%d] Parser::findBlockByEntry([%lx,%lx),%lx) "
1770                        "forced parsing\n",
1771             FILE__,__LINE__,r->low(),r->high(),addr);
1772         parse();
1773     }
1774     return _parse_data->findRegion(r)->get_next_block(addr).second;
1775 }
1776
1777 int
1778 Parser::findBlocks(CodeRegion *r, Address addr, set<Block *> & blocks)
1779 {
1780     if(_parse_state < COMPLETE) {
1781         parsing_printf("[%s:%d] Parser::findBlocks([%lx,%lx),%lx,...) "
1782                        "forced parsing\n",
1783             FILE__,__LINE__,r->low(),r->high(),addr);
1784         parse();
1785     }
1786     return _parse_data->findBlocks(r,addr,blocks); 
1787 }
1788
1789 // find blocks without parsing.
1790 int Parser::findCurrentBlocks(CodeRegion* cr, Address addr, 
1791                                 std::set<Block*>& blocks) {
1792     return _parse_data->findBlocks(cr, addr, blocks);
1793 }
1794
1795 Edge*
1796 Parser::link(Block *src, Block *dst, EdgeTypeEnum et, bool sink)
1797 {
1798     assert(et != NOEDGE);
1799     Edge * e = factory()._mkedge(src,dst,et);
1800     e->_type._sink = sink;
1801     src->_trglist.push_back(e);
1802     dst->_srclist.push_back(e);
1803     _pcb.addEdge(src, e, ParseCallback::target);
1804     _pcb.addEdge(dst, e, ParseCallback::source);
1805     return e;
1806 }
1807
1808 /* 
1809  * During parsing, all edges are temporarily linked to the _tempsink
1810  * block. Adding and then removing edges to and from this block is
1811  * wasteful, especially given that removal is an O(N) operation with
1812  * vector storage. This call thus does not record edges into the sink.
1813  * These edges are guaranteed to have been relinked when parsing is
1814  * in state COMPLETE.
1815  *
1816  * NB This introduces an inconsistency into the CFG invariant that all
1817  *    targets of an edge have that edge in their source list if the
1818  *    data structures are queried during parsing. Extenders of 
1819  *    parsing callbacks are the only ones who can see this state;
1820  *    they have been warned and should know better.
1821  */
1822 Edge*
1823 Parser::link_tempsink(Block *src, EdgeTypeEnum et)
1824 {
1825     Edge * e = factory()._mkedge(src,_sink,et);
1826     e->_type._sink = true;
1827     src->_trglist.push_back(e);
1828     return e;
1829 }
1830
1831 void
1832 Parser::relink(Edge * e, Block *src, Block *dst)
1833 {
1834     bool addSrcAndDest = true;
1835     if(src != e->src()) {
1836         e->src()->removeTarget(e);
1837         _pcb.removeEdge(e->src(), e, ParseCallback::target);
1838         e->_source = src;
1839         src->addTarget(e);
1840         _pcb.addEdge(src, e, ParseCallback::target);
1841         addSrcAndDest = false;
1842     }
1843     if(dst != e->trg()) { 
1844         if(e->trg() != _sink) {
1845             e->trg()->removeSource(e);
1846             _pcb.removeEdge(e->trg(), e, ParseCallback::source);
1847             addSrcAndDest = false;
1848         }
1849         e->_target = dst;
1850         dst->addSource(e);
1851         _pcb.addEdge(dst, e, ParseCallback::source);
1852         if (addSrcAndDest) {
1853             // We're re-linking a sinkEdge to be a non-sink edge; since 
1854             // we don't inform PatchAPI of temporary sinkEdges, we have
1855             // to add both the source AND target edges
1856             _pcb.addEdge(src, e, ParseCallback::target);
1857         }
1858     }
1859
1860     e->_type._sink = (dst == _sink);
1861 }
1862
1863 ParseFrame::Status
1864 Parser::frame_status(CodeRegion * cr, Address addr)
1865 {
1866     // XXX parsing frames may have been cleaned up, but status
1867     //     is always cached
1868     return _parse_data->frameStatus(cr,addr);
1869 }
1870
1871 void
1872 Parser::remove_func(Function *func)
1873 {
1874     if (sorted_funcs.end() != sorted_funcs.find(func)) {
1875         sorted_funcs.erase(func);
1876     }
1877     if (HINT == func->src()) {
1878         for (unsigned fidx=0; fidx < hint_funcs.size(); fidx++) {
1879             if (hint_funcs[fidx] == func) {
1880                 hint_funcs[fidx] = hint_funcs[hint_funcs.size()-1];
1881                 hint_funcs.pop_back();
1882                 break;
1883             }
1884         }
1885     }
1886     else {
1887         for (unsigned fidx=0; fidx < discover_funcs.size(); fidx++) {
1888             if (discover_funcs[fidx] == func) {
1889                 discover_funcs[fidx] = discover_funcs[discover_funcs.size()-1];
1890                 discover_funcs.pop_back();
1891                 break;
1892             }
1893         }
1894     }
1895     
1896     _parse_data->remove_func(func);
1897 }
1898
1899 void
1900 Parser::remove_block(Dyninst::ParseAPI::Block *block)
1901 {
1902     _parse_data->remove_block(block);
1903 }
1904
1905 void Parser::move_func(Function *func, Address new_entry, CodeRegion *new_reg)
1906 {
1907     region_data *reg_data = _parse_data->findRegion(func->region());
1908     reg_data->funcsByAddr.erase(func->addr());
1909
1910     reg_data = _parse_data->findRegion(new_reg);
1911     reg_data->funcsByAddr[new_entry] = func;
1912 }
1913
1914 void Parser::invalidateContainingFuncs(Function *owner, Block *b)
1915 {
1916     CodeRegion * cr;
1917     if(owner->region()->contains(b->start()))
1918         cr = owner->region();
1919     else
1920         cr = _parse_data->reglookup(owner->region(),b->start());
1921     region_data * rd = _parse_data->findRegion(cr);
1922     
1923
1924     // Any functions holding b that have already been finalized
1925     // need to have their caches invalidated so that they will
1926     // find out that they have this new 'ret' block
1927     std::set<Function*> prev_owners;
1928     rd->findFuncs(b->start(),prev_owners);
1929     for(std::set<Function*>::iterator oit = prev_owners.begin();
1930         oit != prev_owners.end(); ++oit)
1931     {
1932         Function * po = *oit;
1933         po->_cache_valid = false;
1934         parsing_printf("[%s:%d] split of [%lx,%lx) invalidates cache of "
1935                        "func at %lx\n",
1936                        FILE__,__LINE__,b->start(),b->end(),po->addr());
1937     }   
1938 }
1939
1940 /* Add ParseFrames waiting on func back to the work queue */
1941 void Parser::resumeFrames(Function * func, vector<ParseFrame *> & work)
1942 {
1943     // If we do not know the function's return status, don't put its waiters back on the worklist
1944     if (func->_rs == UNSET) { 
1945         parsing_printf("[%s] %s return status unknown, cannot resume waiters\n",
1946                 __FILE__,
1947                 func->name().c_str());
1948         return; 
1949     }
1950
1951     // When a function's return status is set, all waiting frames back into the worklist
1952     map<Function *, set<ParseFrame *> >::iterator iter = delayedFrames.find(func);
1953     if (iter == delayedFrames.end()) {
1954         // There were no frames waiting, ignore
1955         parsing_printf("[%s] %s return status %d, no waiters\n",
1956                 __FILE__,
1957                 func->name().c_str(),
1958                 func->_rs);
1959         return;
1960     } else {
1961         parsing_printf("[%s] %s return status %d, undelaying waiting functions\n",
1962                 __FILE__,
1963                 func->name().c_str(),
1964                 func->_rs);
1965         // Add each waiting frame back to the worklist
1966         set<ParseFrame *> vec = iter->second;
1967         for (set<ParseFrame *>::iterator fIter = vec.begin(); 
1968                 fIter != vec.end();
1969                 ++fIter) {
1970             work.push_back(*fIter);
1971         }
1972         // remove func from delayedFrames map
1973         delayedFrames.erase(func);
1974     }
1975 }
1976
1977 bool Parser::getSyscallNumber(Function * /*func*/,
1978         Block * block,
1979         Address /*addr*/,
1980         Architecture arch,
1981         long int & val)
1982 {
1983     val = -1;
1984
1985     // In the common case, the setup of system call number
1986     // is a mov in the previous instruction. We don't currently look elsewhere.
1987     // In the future, could use slicing and symeval to find the value of
1988     // this register at the system call (as unstrip does).
1989     Block::Insns blockInsns;
1990     block->getInsns(blockInsns);
1991     auto prevPair = ++(blockInsns.rbegin());
1992     InstructionAPI::InstructionPtr prevInsn = prevPair->second;
1993     if (prevInsn->getOperation().getID() != e_mov) {
1994         return false;
1995     }
1996
1997     MachRegister syscallNumberReg = MachRegister::getSyscallNumberReg(arch);
1998     if (syscallNumberReg == InvalidReg) {
1999         return false;
2000     }
2001     InstructionAPI::RegisterAST* regAST = new InstructionAPI::RegisterAST(syscallNumberReg);
2002     InstructionAPI::RegisterAST::Ptr regASTPtr = InstructionAPI::RegisterAST::Ptr(regAST);
2003
2004     std::vector<InstructionAPI::Operand> operands;
2005     prevInsn->getOperands(operands);
2006     for (unsigned i = 0; i < operands.size(); i++) {
2007         if (!operands[i].isWritten(regASTPtr)) {
2008             InstructionAPI::Expression::Ptr value = operands[i].getValue();
2009             InstructionAPI::Result res = value->eval();
2010             if (res.defined) {
2011                 val = res.convert<Offset>();
2012             }
2013         }
2014     }
2015
2016     return (val != -1);
2017 }
2018