ghidra/Ghidra/Features/Decompiler/src/decompile/cpp/sleigh.hh

525 lines
20 KiB
C++

/* ###
* IP: GHIDRA
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// \file sleigh.hh
/// \brief Classes and utilities for the main SLEIGH engine
#ifndef __SLEIGH__
#define __SLEIGH__
#include "sleighbase.hh"
class LoadImage;
/// \brief Class for describing a relative p-code branch destination
///
/// An intra-instruction p-code branch takes a \e relative operand.
/// The actual value produced during p-code generation is calculated at
/// the last second using \b this. It stores the index of the BRANCH
/// instruction and a reference to its destination operand. This initially
/// holds a reference to a destination \e label symbol, but is later updated
/// with the final relative value.
struct RelativeRecord {
VarnodeData *dataptr; ///< Varnode indicating relative offset
uintb calling_index; ///< Index of instruction containing relative offset
};
/// \brief Data for building one p-code instruction
///
/// Raw data used by the emitter to produce a single PcodeOp
struct PcodeData {
OpCode opc; ///< The op code
VarnodeData *outvar; ///< Output Varnode data (or null)
VarnodeData *invar; ///< Array of input Varnode data
int4 isize; ///< Number of input Varnodes
};
/// \brief Class for caching a chunk of p-code, prior to emitting
///
/// The engine accumulates PcodeData and VarnodeData objects for
/// a single instruction. Once the full instruction is constructed,
/// the objects are passed to the emitter (PcodeEmit) via the emit() method.
/// The class acts as a pool of memory for PcodeData and VarnodeData objects
/// that can be reused repeatedly to emit multiple instructions.
class PcodeCacher {
VarnodeData *poolstart; ///< Start of the pool of VarnodeData objects
VarnodeData *curpool; ///< First unused VarnodeData
VarnodeData *endpool; ///< End of the pool of VarnodeData objects
vector<PcodeData> issued; ///< P-code ops issued for the current instruction
list<RelativeRecord> label_refs; ///< References to labels
vector<uintb> labels; ///< Locations of labels
VarnodeData *expandPool(uint4 size); ///< Expand the memory pool
public:
PcodeCacher(void); ///< Constructor
~PcodeCacher(void); ///< Destructor
/// \brief Allocate data objects for a new set of Varnodes
///
/// \param size is the number of objects to allocate
/// \return a pointer to the array of available VarnodeData objects
VarnodeData *allocateVarnodes(uint4 size) {
VarnodeData *newptr = curpool + size;
if (newptr <= endpool) {
VarnodeData *res = curpool;
curpool = newptr;
return res;
}
return expandPool(size);
}
/// \brief Allocate a data object for a new p-code operation
///
/// \return the new PcodeData object
PcodeData *allocateInstruction(void) {
issued.emplace_back();
PcodeData *res = &issued.back();
res->outvar = (VarnodeData *)0;
res->invar = (VarnodeData *)0;
return res;
}
void addLabelRef(VarnodeData *ptr); ///< Denote a Varnode holding a \e relative \e branch offset
void addLabel(uint4 id); ///< Attach a label to the \e next p-code instruction
void clear(void); ///< Reset the cache so that all objects are unallocated
void resolveRelatives(void); ///< Rewrite branch target Varnodes as \e relative offsets
void emit(const Address &addr,PcodeEmit *emt) const; ///< Pass the cached p-code data to the emitter
};
/// \brief A container for disassembly context used by the SLEIGH engine
///
/// This acts as a factor for the ParserContext objects which are used to disassemble
/// a single instruction. These all share a ContextCache which is a front end for
/// accessing the ContextDatabase and resolving context variables from the SLEIGH spec.
/// ParserContext objects are stored in a hash-table keyed by the address of the instruction.
class DisassemblyCache {
Translate *translate; ///< The Translate object that owns this cache
ContextCache *contextcache; ///< Cached values from the ContextDatabase
AddrSpace *constspace; ///< The constant address space
int4 minimumreuse; ///< Can call getParserContext this many times, before a ParserContext is reused
uint4 mask; ///< Size of the hashtable in form 2^n-1
ParserContext **list; ///< (circular) array of currently cached ParserContext objects
int4 nextfree; ///< Current end/beginning of circular list
ParserContext **hashtable; ///< Hashtable for looking up ParserContext via Address
void initialize(int4 min,int4 hashsize); ///< Initialize the hash-table of ParserContexts
void free(void); ///< Free the hash-table of ParserContexts
public:
DisassemblyCache(Translate *trans,ContextCache *ccache,AddrSpace *cspace,int4 cachesize,int4 windowsize); ///< Constructor
~DisassemblyCache(void) { free(); } ///< Destructor
ParserContext *getParserContext(const Address &addr); ///< Get the parser for a particular Address
};
/// \brief Build p-code from a pre-parsed instruction
///
/// Through the build() method, \b this walks the parse tree and prepares data
/// for final emission as p-code. (The final emitting is done separately through the
/// PcodeCacher.emit() method). Generally, only p-code for one instruction is prepared.
/// But, through the \b delay-slot mechanism, build() may recursively visit
/// additional instructions.
class SleighBuilder : public PcodeBuilder {
virtual void dump( OpTpl *op );
AddrSpace *const_space; ///< The constant address space
AddrSpace *uniq_space; ///< The unique address space
uintb uniquemask; ///< Mask of address bits to use to uniquify temporary registers
uintb uniqueoffset; ///< Uniquifier bits for \b this instruction
DisassemblyCache *discache; ///< Cache of disassembled instructions
PcodeCacher *cache; ///< Cache accumulating p-code data for the instruction
void buildEmpty(Constructor *ct,int4 secnum);
void generateLocation(const VarnodeTpl *vntpl,VarnodeData &vn);
AddrSpace *generatePointer(const VarnodeTpl *vntpl,VarnodeData &vn);
void generatePointerAdd(PcodeData *op,const VarnodeTpl *vntpl);
void setUniqueOffset(const Address &addr); ///< Set uniquifying bits for the current instruction
public:
SleighBuilder(ParserWalker *w,DisassemblyCache *dcache,PcodeCacher *pc,AddrSpace *cspc,AddrSpace *uspc,uint4 umask);
virtual void appendBuild(OpTpl *bld,int4 secnum);
virtual void delaySlot(OpTpl *op);
virtual void setLabel(OpTpl *op);
virtual void appendCrossBuild(OpTpl *bld,int4 secnum);
};
/// \brief A full SLEIGH engine
///
/// Its provided with a LoadImage of the bytes to be disassembled and
/// a ContextDatabase.
///
/// Assembly is produced via the printAssembly() method, provided with an
/// AssemblyEmit object and an Address.
///
/// P-code is produced via the oneInstruction() method, provided with a PcodeEmit
/// object and an Address.
class Sleigh : public SleighBase {
LoadImage *loader; ///< The mapped bytes in the program
ContextDatabase *context_db; ///< Database of context values steering disassembly
ContextCache *cache; ///< Cache of recently used context values
mutable DisassemblyCache *discache; ///< Cache of recently parsed instructions
mutable PcodeCacher pcode_cache; ///< Cache of p-code data just prior to emitting
void clearForDelete(void); ///< Delete the context and disassembly caches
protected:
ParserContext *obtainContext(const Address &addr,int4 state) const;
void resolve(ParserContext &pos) const; ///< Generate a parse tree suitable for disassembly
void resolveHandles(ParserContext &pos) const; ///< Prepare the parse tree for p-code generation
public:
Sleigh(LoadImage *ld,ContextDatabase *c_db); ///< Constructor
virtual ~Sleigh(void); ///< Destructor
void reset(LoadImage *ld,ContextDatabase *c_db); ///< Reset the engine for a new program
virtual void initialize(DocumentStorage &store);
virtual void registerContext(const string &name,int4 sbit,int4 ebit);
virtual void setContextDefault(const string &nm,uintm val);
virtual void allowContextSet(bool val) const;
virtual int4 instructionLength(const Address &baseaddr) const;
virtual int4 oneInstruction(PcodeEmit &emit,const Address &baseaddr) const;
virtual int4 printAssembly(AssemblyEmit &emit,const Address &baseaddr) const;
};
/** \page sleigh SLEIGH
\section sleightoc Table of Contents
- \ref sleighoverview
- \ref sleighbuild
- \ref sleighuse
- \subpage sleighAPIbasic
- \subpage sleighAPIemulate
\b Key \b Classes
- \ref Translate
- \ref AssemblyEmit
- \ref PcodeEmit
- \ref LoadImage
- \ref ContextDatabase
\section sleighoverview Overview
Welcome to \b SLEIGH, a machine language translation and
dissassembly engine. SLEIGH is both a processor
specification language and the associated library and
tools for using such a specification to generate assembly
and to generate \b pcode, a reverse engineering Register
Transfer Language (RTL), from binary machine instructions.
SLEIGH was originally based on \b SLED, a
\e Specification \e Language \e for \e Encoding \e and
\e Decoding, designed by Norman Ramsey and Mary F. Fernandez,
which performed disassembly (and assembly). SLEIGH
extends SLED by providing semantic descriptions (via the
RTL) of machine instructions and other practical enhancements
for doing real world reverse engineering.
SLEIGH is part of Project \b GHIDRA. It provides the core
of the GHIDRA disassembler and the data-flow and
decompilation analysis. However, SLEIGH can serve as a
standalone library for use in other applications for
providing a generic disassembly and RTL translation interface.
\section sleighbuild Building SLEIGH
There are a couple of \e make targets for building the SLEIGH
library from source. These are:
\code
make libsla.a # Build the main library
make libsla_dbg.a # Build the library with debug symbols
\endcode
The source code file \e sleighexample.cc has a complete example
of initializing the Translate engine and using it to generate
assembly and pcode. The source has a hard-coded file name,
\e x86testcode, as the example binary executable it attempts
to decode, but this can easily be changed. It also needs
a SLEIGH specification file (\e .sla) to be present.
Building the example application can be done with something
similar to the following makefile fragment.
\code
# The C compiler
CXX=g++
# Debug flags
DBG_CXXFLAGS=-g -Wall -Wno-sign-compare
OPT_CXXFLAGS=-O2 -Wall -Wno-sign-compare
# libraries
INCLUDES=-I./src
LNK=src/libsla_dbg.a
sleighexample.o: sleighexample.cc
$(CXX) -c $(DBG_CXXFLAGS) -o sleighexample sleighexample.o $(LNK)
clean:
rm -rf *.o sleighexample
\endcode
\section sleighuse Using SLEIGH
SLEIGH is a generic reverse engineering tool in the sense
that the API is designed to be completely processor
independent. In order to process binary executables for a
specific processor, The library reads in a \e
specification \e file, which describes how instructions
are encoded and how they are interpreted by the processor.
An application which needs to do disassembly or generate
\b pcode can design to the SLEIGH API once, and then the
application will automatically support any processor for
which there is a specification.
For working with a single processor, the SLEIGH library
needs to load a single \e compiled form of the processor
specification, which is traditionally given a ".sla" suffix.
Most common processors already have a ".sla" file available.
So to use SLEIGH with these processors, the library merely
needs to be made aware of the desired file. This documentation
covers the use of the SLEIGH API, assuming that this
specification file is available.
The ".sla" files themselves are created by running
the \e compiler on a file written in the formal SLEIGH
language. These files traditionally have the suffix ".slaspec"
For those who want to design such a specification for a new
processor, please refer to the document, "SLEIGH: A Language
for Rapid Processor Specification."
*/
/**
\page sleighAPIbasic The Basic SLEIGH Interface
To use SLEIGH as a library within an application, there
are basically five classes that you need to be aware of.
- \ref sleightranslate
- \ref sleighassememit
- \ref sleighpcodeemit
- \ref sleighloadimage
- \ref sleighcontext
\section sleightranslate Translate (or Sleigh)
The core SLEIGH class is Sleigh, which is derived from the
interface, Translate. In order to instantiate it in your code,
you need a LoadImage object, and a ContextDatabase object.
The load image is responsible for retrieving instruction
bytes, based on address, from a binary executable. The context
database provides the library extra mode information that may
be necessary to do the disassembly or translation. This can
be used, for instance, to specify that an x86 binary is running
in 32-bit mode, or to specify that an ARM processor is running
in THUMB mode. Once these objects are built, the Sleigh
object can be immediately instantiated.
\code
LoadImageBfd *loader;
ContextDatabase *context;
Translate *trans;
// Set up the loadimage
// Providing an executable name and architecture
string loadimagename = "x86testcode";
string bfdtarget= "default";
loader = new LoadImageBfd(loadimagename,bfdtarget);
loader->open(); // Load the executable from file
context = new ContextInternal(); // Create a processor context
trans = new Sleigh(loader,context); // Instantiate the translator
\endcode
Once the Sleigh object is in hand, the only required
initialization step left is to inform it of the ".sla" file.
The file is in XML format and needs to be read in using
SLEIGH's built-in XML parser. The following code accomplishes
this.
\code
string sleighfilename = "specfiles/x86.sla";
DocumentStorage docstorage;
Element *sleighroot = docstorage.openDocument(sleighfilename)->getRoot();
docstorage.registerTag(sleighroot);
trans->initialize(docstorage); // Initialize the translator
\endcode
\section sleighassememit AssemblyEmit
In order to do disassembly, you need to derive a class from
AssemblyEmit, and implement the method \e dump. The library
will call this method exactly once, for each instruction
disassembled.
This routine simply needs to decide how (and where) to print
the corresponding portion of the disassembly. For instance,
\code
class AssemblyRaw : public AssemblyEmit {
public:
virtual void dump(const Address &addr,const string &mnem,const string &body) {
addr.printRaw(cout);
cout << ": " << mnem << ' ' << body << endl;
}
};
\endcode
This is a minimal implementation that simply dumps the
disassembly straight to standard out. Once this object is
instantiated, the Sleigh object can use it to write out
assembly via the Translate::printAssembly() method.
\code
AssemblyEmit *assememit = new AssemblyRaw();
Address addr(trans->getDefaultCodeSpace(),0x80484c0);
int4 length; // Length of instruction in bytes
length = trans->printAssembly(*assememit,addr);
addr = addr + length; // Advance to next instruction
length = trans->printAssembly(*assememit,addr);
addr = addr + length;
length = trans->printAssembly(*assememit,addr);
\endcode
\section sleighpcodeemit PcodeEmit
In order to generate a \b pcode translation of a machine
instruction, you need to derive a class from PcodeEmit and
implement the virtual method \e dump. This method will be
invoked once for each \b pcode operation in the translation
of a machine instruction. There will likely be multiple calls
per instruction. Each call passes in a single \b pcode
operation, complete with its possible varnode output, and
all of its varnode inputs. Here is an example of a PcodeEmit
object that simply prints out the \b pcode.
\code
class PcodeRawOut : public PcodeEmit {
public:
virtual void dump(const Address &addr,OpCode opc,VarnodeData *outvar,VarnodeData *vars,int4 isize);
};
static void print_vardata(ostream &s,VarnodeData &data)
{
s << '(' << data.space->getName() << ',';
data.space->printOffset(s,data.offset);
s << ',' << dec << data.size << ')';
}
void PcodeRawOut::dump(const Address &addr,OpCode opc,VarnodeData *outvar,VarnodeData *vars,int4 isize)
{
if (outvar != (VarnodeData *)0) { // The output is optional
print_vardata(cout,*outvar);
cout << " = ";
}
cout << get_opname(opc);
// Possibly check for a code reference or a space reference
for(int4 i=0;i<isize;++i) {
cout << ' ';
print_vardata(cout,vars[i]);
}
cout << endl;
}
\endcode
Notice that the \e dump routine uses the built-in function
\e get_opname to find a string version of the opcode. Each
varnode is defined in terms of the VarnodeData object, which
is defined simply:
\code
struct VarnodeData {
AddrSpace *space; // The address space
uintb offset; // The offset within the space
uint4 size; // The number of bytes at that location
};
\endcode
Once the PcodeEmit object is instantiated, the Sleigh object can
use it to generate pcode, one instruction at a time, using the
Translate::oneInstruction() const method.
\code
PcodeEmit *pcodeemit = new PcodeRawOut();
Address addr(trans->getDefaultCodeSpace(),0x80484c0);
int4 length; // Length of instruction in bytes
length = trans->oneInstruction(*pcodeemit,addr);
addr = addr + length; // Advance to next instruction
length = trans->oneInstruction(*pcodeemit,addr);
addr = addr + length;
length = trans->oneInstruction(*pcodeemit,addr);
\endcode
For an application to properly \e follow \e flow, while translating
machine instructions into pcode, the emitted pcode must be
inspected for the various branch operations.
\section sleighloadimage LoadImage
A LoadImage holds all the binary data from an executable file
in the format similar to how it would exist when being executed
by a real processor. The interface to this from SLEIGH is
actually very simple, although it can hide a complicated
structure. One method does most of the work, LoadImage::loadFill().
It takes a byte pointer, a size, and an Address. The method
is expected to fill in the \e ptr array with \e size bytes
taken from the load image, corresponding to the address \e addr.
There are two more virtual methods that are required for a
complete implementation of LoadImage, \e getArchType and
\e adjustVma, but these do not need to be implemented fully.
\code
class MyLoadImage : public LoadImage {
public:
MyLoadImage(const string &nm) : Loadimage(nm) {}
virtual void loadFill(uint1 *ptr,int4 size,const Address &addr);
virtual string getArchType(void) const { return "mytype"; }
virtual void adjustVma(long adjust) {}
};
\endcode
\section sleighcontext ContextDatabase
The ContextDatabase needs to keep track of any possible
context variable and its value, over different address ranges.
In most cases, you probably don't need to override the class
yourself, but can use the built-in class, ContextInternal.
This provides the basic functionality required and will work
for different architectures. What you may need to do is
set values for certain variables, depending on the processor
and the environment it is running in. For instance, for
the x86 platform, you need to set the \e addrsize and \e opsize
bits, to indicate the processor would be running in 32-bit
mode. The context variables specific to a particular processor
are established by the SLEIGH spec. So the variables can
only be set \e after the spec has been loaded.
\code
...
context = new ContextInternal();
trans = new Sleigh(loader,context);
DocumentStorage docstorage;
Element *root = docstorage.openDocument("specfiles/x86.sla")->getRoot();
docstorage.registerTag(root);
trans->initialize(docstorage);
context->setVariableDefault("addrsize",1); // Address size is 32-bits
context->setVariableDefault("opsize",1); // Operand size is 32-bits
\endcode
*/
#endif