2 * See the dyninst/COPYRIGHT file for copyright information.
4 * We provide the Paradyn Tools (below described as "Paradyn")
5 * on an AS IS basis, and do not warrant its validity or performance.
6 * We reserve the right to update, modify, or discontinue this
7 * software at any time. We shall have no obligation to supply such
8 * updates or modifications or any other form of support to you.
10 * By your use of Paradyn, you understand and agree that we (or any
11 * other person or entity with proprietary rights in Paradyn) are
12 * under no obligation to provide either maintenance services,
13 * update services, notices of latent defects, or correction of
14 * defects for Paradyn.
16 * This library is free software; you can redistribute it and/or
17 * modify it under the terms of the GNU Lesser General Public
18 * License as published by the Free Software Foundation; either
19 * version 2.1 of the License, or (at your option) any later version.
21 * This library is distributed in the hope that it will be useful,
22 * but WITHOUT ANY WARRANTY; without even the implied warranty of
23 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
24 * Lesser General Public License for more details.
26 * You should have received a copy of the GNU Lesser General Public
27 * License along with this library; if not, write to the Free Software
28 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
31 #if defined(cap_stripped_binaries)
41 #include "InstructionDecoder.h"
42 #include "Instruction.h"
44 #include "ProbabilisticParser.h"
47 using namespace Dyninst;
48 using namespace Dyninst::ParseAPI;
49 using namespace Dyninst::InstructionAPI;
50 using namespace NS_x86;
53 clock_t ProbabilityCalculator::totalClocks = 0;
55 // Precision error allowed in double precision float number
57 static int double_cmp(double a, double b) {
59 if (fabs(delta) < ZERO) return 0;
60 else if (delta > ZERO) return 1;
68 ProbAndAddr(Address a, double p): addr(a), prob(p) {}
69 bool operator < (const ProbAndAddr &p) const {
70 if (double_cmp(prob, p.prob) == 0) return addr > p.addr;
74 static bool MatchArgs(unsigned short arg1, unsigned short arg2) {
78 bool IdiomTerm::match(const IdiomTerm &it) const {
79 if (entry_id == WILDCARD_ENTRY_ID || it.entry_id == WILDCARD_ENTRY_ID) return true; // Wildcard matches everything
80 if (entry_id == e_nop && it.entry_id == e_nop) return true; // Nops match without considering arguments
81 if (entry_id != it.entry_id) return false;
82 return MatchArgs(arg1, it.arg1) && MatchArgs(arg2, it.arg2);
85 bool IdiomTerm::matchOpcode(unsigned short eid) const {
86 if (entry_id == WILDCARD_ENTRY_ID || eid == WILDCARD_ENTRY_ID) return true; // Wildcard matches everything
87 return entry_id == eid;
90 bool IdiomTerm::operator == (const IdiomTerm &it) const {
91 return (entry_id == it.entry_id) && (arg1 == it.arg1) && (arg2 == it.arg2);
94 bool IdiomTerm::operator < (const IdiomTerm &it) const {
95 if (entry_id != it.entry_id) return entry_id < it.entry_id;
96 if (arg1 != it.arg1) return arg1 < it.arg1;
97 return arg2 < it.arg2;
100 static string HandleAnOperand(unsigned short arg, int style) {
103 if (style) return "MEM"; else return ":A";
105 else if (arg == IMMARG) {
106 if (style) return "IMM"; else return ":B";
108 else if (arg == MULTIREG) {
109 if (style) return "MULTIREG"; else return ":C";
111 else if (arg == CALLEESAVEREG) {
112 if (style) return "Callee saved reg"; else return ":D";
114 else if (arg == ARGUREG) {
115 if (style) return "Argu passing reg"; else return ":E";
117 else if (arg == OTHERREG) {
118 if (style) return "Other reg"; else return ":F";
121 // the human_readble format is still broken
123 if (arg == 0x0010) return x86_64::rip.name();
124 switch (arg & 0xf000) {
127 return MachRegister(arg | x86_64::GPR | Arch_x86_64).name();
130 return MachRegister(arg | x86_64::MMX | Arch_x86_64).name();
133 return MachRegister(arg | x86_64::XMM | Arch_x86_64).name();
136 return MachRegister(arg | x86_64::FLAG | Arch_x86_64).name();
140 return ":" + MachRegister(arg).name();
143 snprintf(buf, 64, "%x", arg);
144 return string(":") + string(buf);
151 string IdiomTerm::human_format() const {
153 if(*this == WILDCARD_TERM)
155 else if(entryNames_IAPI.find((entryID)(entry_id)) == entryNames_IAPI.end()) {
156 entryname = "[UNMAPED]";
157 fprintf(stderr, "Found entryID not mapped in entryNames_IAPI %d\n", entry_id);
160 entryname = entryNames_IAPI[(entryID)(entry_id)];
161 if (arg1 != NOARG) entryname += " " + HandleAnOperand(arg1, 1);
162 if (arg2 != NOARG) entryname += "," + HandleAnOperand(arg2, 1);
169 bool Idiom::operator < (const Idiom &i) const {
170 if (terms.size() != i.terms.size()) return terms.size() < i.terms.size();
171 for (size_t index = 0; index < terms.size(); ++index)
172 if (terms[index] < i.terms[index]) return true;
173 else if (i.terms[index] < terms[index]) return false;
177 static void split(const char * str, vector<uint64_t> & terms)
179 const char *s = str, *e = NULL;
182 while((e = strchr(s,'_')) != NULL) {
187 #if defined (os_windows)
188 #define dyn_strtoull _strtoui64
190 #define dyn_strtoull strtoull
192 terms.push_back(dyn_strtoull(buf,NULL,16));
198 terms.push_back(dyn_strtoull(s,NULL,16));
202 Idiom::Idiom(string format, double weight, bool pre):
203 w(weight), prefix(pre)
205 vector<uint64_t> items;
206 split(format.c_str(), items);
208 unsigned short entry_id, arg1, arg2;
209 for (size_t i = 0; i < items.size(); ++i) {
210 uint64_t t = items[i];
211 if(!(t & ENTRY_MASK)) {
215 if(!(t & ENTRY_MASK)) {
220 entry_id = (t>>32) & 0xffffffff;
221 arg1 = (t>>16) & 0xffff;
223 terms.push_back(IdiomTerm(entry_id, arg1, arg2));
226 if (prefix) reverse(terms.begin(), terms.end());
229 string Idiom::human_format() const {
232 //printf("formatting %s\n",format().c_str());
233 for(unsigned i=0;i<terms.size();++i) {
234 ret += terms[i].human_format();
242 IdiomPrefixTree::IdiomPrefixTree() {
246 void IdiomPrefixTree::addIdiom(const Idiom& idiom) {
250 void IdiomPrefixTree::addIdiom(int cur, const Idiom& idiom) {
251 if (cur == (int)idiom.terms.size()) {
255 ChildrenByEntryID::iterator idit = childrenClusters.find(idiom.terms[cur].entry_id);
256 if (idit == childrenClusters.end()) {
257 idit = childrenClusters.insert(make_pair(idiom.terms[cur].entry_id,ChildrenType())).first;
259 ChildrenType& children = idit->second;
260 ChildrenType::iterator next = children.end();
261 for (auto cit = children.begin(); cit != children.end(); ++cit) {
262 if (cit->first == idiom.terms[cur]) {
268 if (next == children.end()) {
269 children.push_back(make_pair(idiom.terms[cur], new IdiomPrefixTree() ) );
270 next = children.end();
273 next->second->addIdiom(cur+1, idiom);
277 bool IdiomPrefixTree::findChildrenWithOpcode(unsigned short entry_id, ChildrenType &ret) {
278 ChildrenByEntryID::iterator idit = childrenClusters.find(entry_id);
279 if (idit != childrenClusters.end()) {
280 ret.insert(ret.end(), idit->second.begin(), idit->second.end());
282 idit = childrenClusters.find(WILDCARD_ENTRY_ID);
283 if (idit != childrenClusters.end()) {
284 ret.insert(ret.end(), idit->second.begin(), idit->second.end());
289 bool IdiomPrefixTree::findChildrenWithArgs(unsigned short arg1, unsigned short arg2, const ChildrenType &candidate, ChildrenType &ret) {
290 for (auto cit = candidate.begin(); cit != candidate.end(); ++cit) {
291 if (cit->first.match(IdiomTerm(cit->first.entry_id, arg1, arg2)))
297 const IdiomPrefixTree::ChildrenType* IdiomPrefixTree::getChildrenByEntryID(unsigned short entry_id) {
298 ChildrenByEntryID::iterator iter = childrenClusters.find(entry_id);
299 if (iter == childrenClusters.end())
302 return &iter->second;
304 const IdiomPrefixTree::ChildrenType* IdiomPrefixTree::getWildCardChildren() {
305 return getChildrenByEntryID(WILDCARD_ENTRY_ID);
307 ProbabilityCalculator::ProbabilityCalculator(CodeRegion *reg, CodeSource *source, Parser* p, string model_spec):
308 model(model_spec), cr(reg), cs(source), parser(p)
312 static bool PassPreCheck(unsigned char *buf) {
313 if (buf == NULL) return false;
314 if (*buf == 0 || *buf == 0x90) return false;
318 double ProbabilityCalculator::calcProbByMatchingIdioms(Address addr) {
319 if (FEPProb.find(addr) != FEPProb.end())
320 return FEPProb[addr];
321 unsigned char *buf = (unsigned char*)(cs->getPtrToInstruction(addr));
322 if (!PassPreCheck(buf)) return 0;
323 // if (addr != 0x8049d68) return 0;
324 double w = model.getBias();
326 // fprintf(stderr, "before forward matching w = %.6lf\n", w);
327 w += calcForwardWeights(0, addr, model.getNormalIdiomTreeRoot(), valid);
328 // fprintf(stderr, "after forward matching w = %.6lf\n", w);
331 set<IdiomPrefixTree*> matched;
332 w += calcBackwardWeights(0, addr, model.getPrefixIdiomTreeRoot(), matched);
333 // fprintf(stderr, "after backward matching w = %.6lf\n", w);
335 double prob = ((double)1) / (1 + exp(-w));
336 return FEPProb[addr] = reachingProb[addr] = prob;
337 } else return FEPProb[addr] = reachingProb[addr] = 0;
340 void ProbabilityCalculator::calcProbByEnforcingConstraints() {
342 // Initialize our knowledge about non-gap functions
344 for (auto fit = parser->obj().funcs().begin(); fit != parser->obj().funcs().end(); ++fit) {
345 finalized.insert(*fit);
346 for (auto bit = (*fit)->blocks().begin(); bit != (*fit)->blocks().end(); ++bit) {
347 for (Address addr = (*bit)->start(); addr != (*bit)->end(); ++addr)
348 reachingProb[addr] = 2;
352 priority_queue<ProbAndAddr> q;
353 double threshold = model.getProbThreshold();
354 // Use the prob from matching idioms and push all the FEP candidates into the priority queue
355 for (auto pit = FEPProb.begin(); pit != FEPProb.end(); ++pit)
356 if (pit->second >= threshold)
357 q.push(ProbAndAddr(pit->first, pit->second));
360 ProbAndAddr pa = q.top();
363 //This is an out-dated item. The address has either improved prob by
364 //applying call consistency constraints or 0 prob by applying overlapping constraints.
365 if (double_cmp(FEPProb[pa.addr], pa.prob) != 0) continue;
367 parser->parse_at(cr, pa.addr, true, GAP);
369 Function *f = parser->findFuncByEntry(cr, pa.addr);
372 fprintf(stderr, "Enforcing constraints at %lx with prob %.6lf\n", pa.addr, pa.prob);
373 // Enforce the overlapping constraints
374 dyn_hash_map<Address, double> newFEPProb, newReachingProb;
375 dyn_hash_set<Function *> newDiscoveredFuncs;
376 if (enforceOverlappingConstraints(f, pa.addr, pa.prob, newFEPProb, newReachingProb, newDiscoveredFuncs)) {
377 Finalize(newFEPProb, newReachingProb, newDiscoveredFuncs);
378 // Enforce the call consistency constraints
380 Remove(newDiscoveredFuncs);
386 double ProbabilityCalculator::getFEPProb(Address addr) {
387 if (FEPProb.find(addr) != FEPProb.end()) return FEPProb[addr];
391 double ProbabilityCalculator::getReachingProb(Address addr) {
392 if (reachingProb.find(addr) != reachingProb.end()) return reachingProb[addr];
396 bool ProbabilityCalculator::isFEP(Address addr) {
397 double prob = getFEPProb(addr);
398 if (prob >= model.getProbThreshold()) return true; else return false;
401 double ProbabilityCalculator::calcForwardWeights(int cur, Address addr, IdiomPrefixTree *tree, bool &valid) {
402 if (addr >= cr->high()) return 0;
403 // printf("Start matching at %lx for %dth idiom term\n", addr, cur);
406 if (tree->isFeature()) w = tree->getWeight();
408 if (tree->isLeafNode()) return w;
411 if (!decodeInstruction(data, addr)) {
416 const IdiomPrefixTree::ChildrenType* children = tree->getChildrenByEntryID(data.entry_id);
417 if (children != NULL) {
418 for (auto cit = children->begin(); cit != children->end() && valid; ++cit)
419 if (cit->first.match(IdiomTerm(cit->first.entry_id, data.arg1, data.arg2))) {
420 w += calcForwardWeights(cur + 1, addr + data.len, cit->second, valid);
423 if (!valid) return 0;
424 // Wildcard terms also match the current instruction
425 children = tree->getWildCardChildren();
426 if (children != NULL) {
427 // Note that for a wildcard term,
428 // there is no need to really check whether the operands match or not,
429 // but at least we know that the current address can
430 // be decoded into a valid instruction.
431 for (auto cit = children->begin(); cit != children->end() && valid; ++cit)
432 w += calcForwardWeights(cur + 1, addr + data.len, cit->second, valid);
435 // the return value is not important if "valid" becomes false
439 double ProbabilityCalculator::calcBackwardWeights(int cur, Address addr, IdiomPrefixTree *tree, set<IdiomPrefixTree*> &matched) {
441 if (tree->isFeature()) {
442 if (matched.find(tree) == matched.end()) {
443 matched.insert(tree);
444 w += tree->getWeight();
445 // fprintf(stderr, "Backward match idiom with weight %.6lf\n", tree->getWeight());
448 // printf("Start matching at %lx for %dth idiom term\n", addr, cur);
450 if (tree->isLeafNode()) return w;
452 for (Address prevAddr = addr - 1; prevAddr >= cr->low() && addr - prevAddr <= 15; --prevAddr) {
454 if (!decodeInstruction(data, prevAddr)) continue;
455 if (prevAddr + data.len != addr) continue;
457 // Look for idioms that match the exact current instruction
458 const IdiomPrefixTree::ChildrenType* children = tree->getChildrenByEntryID(data.entry_id);
459 if (children != NULL) {
460 for (auto cit = children->begin(); cit != children->end(); ++cit)
461 if (cit->first.match(IdiomTerm(cit->first.entry_id, data.arg1, data.arg2))) {
462 w += calcBackwardWeights(cur + 1, prevAddr , cit->second, matched);
465 // Wildcard terms also match the current instruction
466 children = tree->getWildCardChildren();
467 if (children != NULL) {
468 for (auto cit = children->begin(); cit != children->end(); ++cit)
469 w += calcBackwardWeights(cur + 1, prevAddr , cit->second, matched);
476 bool ProbabilityCalculator::decodeInstruction(DecodeData &data, Address addr) {
477 DecodeCache::iterator iter = decodeCache.find(addr);
478 if (iter != decodeCache.end()) {
480 if (data.len == 0) return false;
482 unsigned char *buf = (unsigned char*)(cs->getPtrToInstruction(addr));
484 decodeCache.insert(make_pair(addr, DecodeData(JUNK_OPCODE, 0,0,0)));
487 InstructionDecoder dec( buf , 30, cs->getArch());
488 Instruction::Ptr insn = dec.decode();
490 decodeCache.insert(make_pair(addr, DecodeData(JUNK_OPCODE, 0,0,0)));
493 data.len = insn->size();
495 decodeCache.insert(make_pair(addr, DecodeData(JUNK_OPCODE, 0,0,0)));
499 const Operation & op = insn->getOperation();
500 data.entry_id = op.getID();
503 insn->getOperands(ops);
504 int args[2] = {NOARG,NOARG};
505 for(unsigned int i=0;i<2 && i<ops.size();++i) {
506 Operand & op = ops[i];
507 if (op.getValue()->size() == 0) {
508 // This is actually an invalid instruction with valid opcode
509 // so modify the opcode cache to make it invalid
510 decodeCache.insert(make_pair(addr, DecodeData(JUNK_OPCODE, 0,0,0)));
514 if(!op.readsMemory() && !op.writesMemory()) {
515 // register or immediate
516 set<RegisterAST::Ptr> regs;
518 op.getWriteSet(regs);
521 if (regs.size() > 1) {
524 args[i] = (*regs.begin())->getID();
536 decodeCache.insert(make_pair(addr, data));
543 bool ProbabilityCalculator::enforceOverlappingConstraints(Function *f,
546 dyn_hash_map<Address, double> &newFEPProb,
547 dyn_hash_map<Address, double> &newReachingProb,
548 dyn_hash_set<Function*> &newDiscoveredFuncs) {
549 // Only apply the overlapping constraints to un-finalized functions
550 if (finalized.find(f) != finalized.end()) return true;
551 if (newDiscoveredFuncs.find(f) != newDiscoveredFuncs.end()) return true;
552 newDiscoveredFuncs.insert(f);
553 fprintf(stderr," in function at %lx\n", f->addr());
554 for (auto bit = f->blocks().begin(); bit != f->blocks().end(); ++bit) {
555 if ((*bit)->region() != cr) continue;
556 fprintf(stderr, " block [%lx,%lx)\n", (*bit)->start(), (*bit)->end());
557 for (Address addr = (*bit)->start(); addr < (*bit)->end(); ++addr) {
559 // Meet a byte that is in a function with higher probability: conflict
560 if (double_cmp(cur_prob, getReachingProb(addr)) < 0) {
561 fprintf(stderr, " conflict with %lx with reaching prob %.6lf\n", addr, getReachingProb(addr));
563 } else if (double_cmp(cur_prob, getReachingProb(addr)) > 0) {
564 newReachingProb[addr] = cur_prob;
565 newFEPProb[addr] = 0;
567 // TODO: meet a byte with same prob
572 auto call_edges = f->callEdges();
573 for (auto eit = call_edges.begin(); eit != call_edges.end(); ++eit) {
574 if ((*eit)->type() == CALL_FT) continue;
575 Address target = (*eit)->trg()->start();
576 if (reachingProb.find(target) == reachingProb.end()) continue;
577 if (double_cmp(cur_prob, getFEPProb(target)) > 0) {
578 newFEPProb[target] = cur_prob;
580 if (double_cmp(cur_prob, getReachingProb(target)) > 0){
581 newReachingProb[target] = cur_prob;
583 Function *callee = parser->findFuncByEntry(cr, target);
584 // fprintf(stderr, " find call target from %lx to %lx\n", (*eit)->src()->last(), target);
585 if (callee == NULL) {
586 parser->parse_at(cr, target, true, GAP);
587 callee = parser->findFuncByEntry(cr, target);
588 // assert(callee != NULL);
590 if (callee == NULL) continue;
591 if (!enforceOverlappingConstraints(callee, cur_addr, cur_prob, newFEPProb, newReachingProb, newDiscoveredFuncs)) return false;
596 void ProbabilityCalculator::Finalize(dyn_hash_map<Address, double> &newFEPProb,
597 dyn_hash_map<Address, double> &newReachingProb,
598 dyn_hash_set<Function*> &newDiscoveredFuncs) {
599 for (auto nit = newFEPProb.begin(); nit != newFEPProb.end(); ++nit) {
600 FEPProb[nit->first] = nit->second;
602 for (auto nit = newReachingProb.begin(); nit != newReachingProb.end(); ++nit) {
603 reachingProb[nit->first] = nit->second;
605 finalized.insert(newDiscoveredFuncs.begin(), newDiscoveredFuncs.end());
608 void ProbabilityCalculator::Remove(dyn_hash_set<Function*> &newDiscoveredFuncs) {
609 for (auto fit = newDiscoveredFuncs.begin(); fit != newDiscoveredFuncs.end(); ++fit) {
610 parser->remove_func(*fit);
614 void ProbabilityCalculator::prioritizedGapParsing() {
615 priority_queue<ProbAndAddr> q;
616 double threshold = model.getProbThreshold();
618 // Use the prob from matching idioms and push all the FEP candidates into the priority queue
619 for (auto pit = FEPProb.begin(); pit != FEPProb.end(); ++pit)
620 if (pit->second >= threshold)
621 q.push(ProbAndAddr(pit->first, pit->second));
624 ProbAndAddr pa = q.top();
626 parser->parse_at(cr, pa.addr, true, GAP);