Initial commit of new StringManager

This commit is contained in:
caheckman 2020-04-17 18:37:40 -04:00
parent 6a15520aa5
commit 0eb48e441f
19 changed files with 740 additions and 236 deletions

View file

@ -271,6 +271,7 @@ model {
include "database.cc"
include "cpool.cc"
include "comment.cc"
include "stringmanage.cc"
include "fspec.cc"
include "action.cc"
include "loadimage.cc"
@ -321,6 +322,7 @@ model {
include "cpool_ghidra.cc"
include "ghidra_process.cc"
include "comment_ghidra.cc"
include "string_ghidra.cc"
// include "callgraph.cc" // uncomment for debug
// include "ifacedecomp.cc" // uncomment for debug
// include "ifaceterm.cc" // uncomment for debug

View file

@ -75,7 +75,7 @@ EXTERNAL_CONSOLEEXT_NAMES=$(subst .cc,,$(notdir $(EXTERNAL_CONSOLEEXT_SOURCE)))
CORE= xml space float address pcoderaw translate opcodes globalcontext
# Additional core files for any projects that decompile
DECCORE=capability architecture options graph cover block cast typeop database cpool \
comment fspec action loadimage grammar varnode op \
comment stringmanage fspec action loadimage grammar varnode op \
type variable varmap jumptable emulate emulateutil flow userop \
funcdata funcdata_block funcdata_op funcdata_varnode pcodeinject \
heritage prefersplit rangeutil ruleaction subflow blockaction merge double \
@ -87,7 +87,7 @@ SLEIGH= sleigh pcodeparse pcodecompile sleighbase slghsymbol \
# Additional files for the GHIDRA specific build
GHIDRA= ghidra_arch inject_ghidra ghidra_translate loadimage_ghidra \
typegrp_ghidra database_ghidra ghidra_context cpool_ghidra \
ghidra_process comment_ghidra $(GHIDRAEXT_NAMES)
ghidra_process comment_ghidra string_ghidra $(GHIDRAEXT_NAMES)
# Additional files specific to the sleigh compiler
SLACOMP=slgh_compile slghparse slghscan
# Additional special files that should not be considered part of the library

View file

@ -100,6 +100,7 @@ Architecture::Architecture(void)
loader = (LoadImage *)0;
pcodeinjectlib = (PcodeInjectLibrary *)0;
commentdb = (CommentDatabase *)0;
stringManager = (StringManager *)0;
cpool = (ConstantPool *)0;
symboltab = new Database(this);
context = (ContextDatabase *)0;
@ -152,6 +153,8 @@ Architecture::~Architecture(void)
delete pcodeinjectlib;
if (commentdb != (CommentDatabase *)0)
delete commentdb;
if (stringManager != (StringManager *)0)
delete stringManager;
if (cpool != (ConstantPool *)0)
delete cpool;
if (context != (ContextDatabase *)0)
@ -268,6 +271,7 @@ void Architecture::clearAnalysis(Funcdata *fd)
fd->clear(); // Clear stuff internal to function
// Clear out any analysis generated comments
commentdb->clearType(fd->getAddress(),Comment::warning|Comment::warningheader);
stringManager->clear();
}
/// Symbols do not necessarily need to be available for the decompiler.
@ -405,6 +409,7 @@ void Architecture::saveXml(ostream &s) const
symboltab->saveXml(s);
context->saveXml(s);
commentdb->saveXml(s);
stringManager->saveXml(s);
if (!cpool->empty())
cpool->saveXml(s);
s << "</save_state>\n";
@ -437,6 +442,8 @@ void Architecture::restoreXml(DocumentStorage &store)
context->restoreXml(subel,this);
else if (subel->getName() == "commentdb")
commentdb->restoreXml(subel,this);
else if (subel->getName() == "stringmanage")
stringManager->restoreXml(subel,this);
else if (subel->getName() == "constantpool")
cpool->restoreXml(subel,*types);
else if (subel->getName() == "optionslist")
@ -575,6 +582,14 @@ void Architecture::buildCommentDB(DocumentStorage &store)
commentdb = new CommentDatabaseInternal();
}
/// Create the container that caches decoded strings for this architecture.
/// This implementation installs a StringManagerUnicode attached to \b this.
/// \param store may hold configuration information
void Architecture::buildStringManager(DocumentStorage &store)

{
  const int4 maxDecodedBytes = 2048;	// Limit (in bytes) handed to the manager for a single decoded string
  stringManager = new StringManagerUnicode(this,maxDecodedBytes);
}
/// Some processor models (Java byte-code) need a database of constants.
/// The database is always built, but may remain empty.
/// \param store may hold configuration information
@ -1237,6 +1252,7 @@ void Architecture::init(DocumentStorage &store)
buildContext(store);
buildTypegrp(store);
buildCommentDB(store);
buildStringManager(store);
buildConstantPool(store);
restoreFromSpec(store);

View file

@ -28,6 +28,7 @@
#include "loadimage.hh"
#include "globalcontext.hh"
#include "comment.hh"
#include "stringmanage.hh"
#include "userop.hh"
#include "options.hh"
#include "transform.hh"
@ -147,6 +148,7 @@ public:
PcodeInjectLibrary *pcodeinjectlib; ///< Pcode injection manager
RangeList nohighptr; ///< Ranges for which high-level pointers are not possible
CommentDatabase *commentdb; ///< Comments for this architecture
StringManager *stringManager; ///< Manager of decoded strings
ConstantPool *cpool; ///< Deferred constant values
PrintLanguage *print; ///< Current high-level language printer
vector<PrintLanguage *> printlist; ///< List of high-level language printers supported
@ -227,6 +229,7 @@ protected:
virtual void buildTypegrp(DocumentStorage &store); ///< Build the data-type factory/container
virtual void buildCommentDB(DocumentStorage &store); ///< Build the comment database
virtual void buildStringManager(DocumentStorage &store); ///< Build the string manager
virtual void buildConstantPool(DocumentStorage &store); ///< Build the constant pool
virtual void buildInstructions(DocumentStorage &store); ///< Register the p-code operations
virtual void buildAction(DocumentStorage &store); ///< Build the Action framework

View file

@ -615,6 +615,52 @@ void ArchitectureGhidra::getBytes(uint1 *buf,int4 size,const Address &inaddr)
readResponseEnd(sin);
}
/// \brief Ask the Ghidra client for the string bytes stored at an address
///
/// A \e getString command is written to the client carrying the address (saved with
/// \b maxBytes), and the name and id of the character data-type.  A successful response
/// starts with a 2 character length (6 bits per character, offset by 0x20), followed by the
/// payload with every byte encoded as two letters (one nibble per letter, offset from 'A').
/// The whole payload is always consumed from the stream, but no more than \b maxBytes
/// decoded bytes are ever written to the caller's buffer.
/// \param buf is the buffer receiving the decoded bytes (must hold at least \b maxBytes)
/// \param addr is the address of the string
/// \param ct is the character data-type whose name and id are sent with the query
/// \param maxBytes is the maximum number of bytes the caller can accept
/// \return the number of bytes placed in \b buf
uint4 ArchitectureGhidra::getStringData(uint1 *buf,const Address &addr,Datatype *ct,int4 maxBytes)

{
  sout.write("\000\000\001\004",4);
  writeStringStream(sout,"getString");
  sout.write("\000\000\001\016",4); // Beginning of string header
  addr.saveXml(sout,maxBytes);
  sout.write("\000\000\001\017",4);
  writeStringStream(sout,ct->getName());
  sout.write("\000\000\001\016",4); // Beginning of string header
  sout << dec << (int8)ct->getId(); // Pass as a signed integer
  sout.write("\000\000\001\017",4);
  sout.write("\000\000\001\005",4);
  sout.flush();

  readToResponse(sin);
  int4 type = readToAnyBurst(sin);
  uint4 size = 0;
  if (type == 12) {
    int4 c = sin.get();
    size ^= (c-0x20);
    c = sin.get();
    size ^= ((c-0x20)<<6);
    // Container owns the raw payload, so it is released even if the read throws
    vector<uint1> dblbuf(size * 2);
    sin.read((char *)dblbuf.data(),size*2);	// Always drain the full payload to keep the stream aligned
    // The client controls the reported size; never write more than the caller's buffer can hold
    const uint4 limit = (maxBytes > 0) ? (uint4)maxBytes : 0;
    if (size > limit)
      size = limit;
    for (uint4 i=0; i < size; i++) {
      buf[i] = ((dblbuf[i*2]-'A') << 4) | (dblbuf[i*2 + 1]-'A');
    }
  }
  else if ((type&1)==1) {
    ostringstream errmsg;
    errmsg << "GHIDRA has no string in the loadimage at " << addr.getShortcut();
    addr.printRaw(errmsg);
    throw DataUnavailError(errmsg.str());
  }
  else
    throw JavaError("alignment","Expecting bytes or end of query response");
  type = readToAnyBurst(sin);
  if (type != 13)
    throw JavaError("alignment","Expecting byte alignment end");
  readResponseEnd(sin);
  return size;
}
/// \brief Retrieve p-code to inject for a specific context
///
/// The particular injection is named and is of one of the types:

View file

@ -124,6 +124,7 @@ public:
bool getSendParamMeasures(void) const { return sendParamMeasures; } ///< Get the current setting for emitting parameter info
virtual uint4 getStringData(uint1 *buf,const Address &addr,Datatype *ct,int4 maxBytes);
virtual void printMessage(const string &message) const;
static void segvHandler(int4 sig); ///< Handler for a segment violation (SIGSEGV) signal

View file

@ -245,6 +245,7 @@ void FlushNative::rawAction(void)
ghidra->symboltab->deleteSubScopes(globscope); // Flush cached function and globals database
ghidra->types->clearNoncore(); // Reset type information
ghidra->commentdb->clear(); // Clear any comments
ghidra->stringManager->clear(); // Clear string decodings
ghidra->cpool->clear();
res = 0;
}

View file

@ -1164,7 +1164,7 @@ void PrintC::printUnicode(ostream &s,int4 onechar) const
s << "\\x" << setfill('0') << setw(8) << hex << onechar;
return;
}
writeUtf8(s, onechar); // emit normally
StringManager::writeUtf8(s, onechar); // emit normally
}
void PrintC::pushType(const Datatype *ct)
@ -1204,32 +1204,6 @@ bool PrintC::doEmitWideCharPrefix(void) const
return true;
}
/// \brief Check if the byte buffer has a (unicode) string terminator
///
/// \param buffer is the byte buffer
/// \param size is the number of bytes in the buffer
/// \param charsize is the presumed size (in bytes) of character elements
/// \return \b true if a string terminator is found
bool PrintC::hasCharTerminator(uint1 *buffer,int4 size,int4 charsize)
{
for(int4 i=0;i<size;i+=charsize) {
bool isTerminator = true;
for(int4 j=0;j<charsize;++j) {
if (buffer[i+j] != 0) { // Non-zero bytes means character can't be a null terminator
isTerminator = false;
break;
}
}
if (isTerminator) return true;
}
return false;
}
#define STR_LITERAL_BUFFER_MAXSIZE 2048
#define STR_LITERAL_BUFFER_INCREMENT 32
/// \brief Print a quoted (unicode) string at the given address.
///
/// Data for the string is obtained directly from the LoadImage. The bytes are checked
@ -1237,43 +1211,27 @@ bool PrintC::hasCharTerminator(uint1 *buffer,int4 size,int4 charsize)
/// pass, the string is emitted.
/// \param s is the output stream to print to
/// \param addr is the address of the string data within the LoadImage
/// \param charsize is the number of bytes in an encoded element (i.e. UTF8, UTF16, or UTF32)
/// \param charType is the underlying character data-type
/// \return \b true if a proper string was found and printed to the stream
bool PrintC::printCharacterConstant(ostream &s,const Address &addr,int4 charsize) const
bool PrintC::printCharacterConstant(ostream &s,const Address &addr,Datatype *charType) const
{
uint1 buffer[STR_LITERAL_BUFFER_MAXSIZE+4]; // Additional buffer for get_codepoint skip readahead
int4 curBufferSize = 0;
bool foundTerminator = false;
const uint1 *buffer;
StringManager *manager = glb->stringManager;
try {
do {
uint4 newBufferSize = curBufferSize + STR_LITERAL_BUFFER_INCREMENT;
glb->loader->loadFill(buffer+curBufferSize,STR_LITERAL_BUFFER_INCREMENT,addr + curBufferSize);
foundTerminator = hasCharTerminator(buffer+curBufferSize,STR_LITERAL_BUFFER_INCREMENT,charsize);
curBufferSize = newBufferSize;
} while ((curBufferSize < STR_LITERAL_BUFFER_MAXSIZE)&&(!foundTerminator));
buffer = manager->getStringData(addr, charType);
} catch(DataUnavailError &err) {
return false;
}
buffer[curBufferSize] = 0; // Make sure bytes for final codepoint read are initialized
buffer[curBufferSize+1] = 0;
buffer[curBufferSize+2] = 0;
buffer[curBufferSize+3] = 0;
bool bigend = glb->translate->isBigEndian();
bool res;
if (isCharacterConstant(buffer,curBufferSize,charsize)) {
if (doEmitWideCharPrefix() && charsize > 1)
if (doEmitWideCharPrefix() && charType->getSize() > 1)
s << 'L'; // Print symbol indicating wide character
s << '"';
if (!escapeCharacterData(s,buffer,curBufferSize,charsize,bigend))
if (!escapeCharacterData(s,buffer,manager->getMaximumBytes(),charType->getSize(),glb->translate->isBigEndian()))
s << "...\" /* TRUNCATED STRING LITERAL */";
else s << '"';
res = true;
}
else
res = false;
return res;
s << '"';
return true;
}
void PrintC::resetDefaultsPrintC(void)
@ -1373,7 +1331,7 @@ bool PrintC::pushPtrCharConstant(uintb val,const TypePointer *ct,const Varnode *
ostringstream str;
Datatype *subct = ct->getPtrTo();
if (!printCharacterConstant(str,stringaddr,subct->getSize()))
if (!printCharacterConstant(str,stringaddr,subct))
return false; // Can we get a nice ASCII string
pushAtom(Atom(str.str(),vartoken,EmitXml::const_color,op,vn));
@ -1577,7 +1535,7 @@ void PrintC::pushSymbol(const Symbol *sym,const Varnode *vn,const PcodeOp *op)
SymbolEntry *entry = sym->getFirstWholeMap();
if (entry != (SymbolEntry *)0) {
ostringstream s;
if (printCharacterConstant(s,entry->getAddr(),subct->getSize())) {
if (printCharacterConstant(s,entry->getAddr(),subct)) {
pushAtom(Atom(s.str(),vartoken,EmitXml::const_color,op,vn));
return;
}
@ -1963,25 +1921,6 @@ void PrintC::setCommentStyle(const string &nm)
throw LowlevelError("Unknown comment style. Use \"c\" or \"cplusplus\"");
}
bool PrintC::isCharacterConstant(const uint1 *buf,int4 size,int4 charsize) const
{
// Return true if this looks like a c-string
// If the string is encoded in UTF8 or ASCII, we get (on average) a bit of check
// per character. For UTF16, the surrogate reserved area gives at least some check.
if (buf == (const uint1 *)0) return false;
bool bigend = glb->translate->isBigEndian();
int4 i=0;
int4 skip = charsize;
while(i<size) {
int4 codepoint = getCodepoint(buf+i,charsize,bigend,skip);
if (codepoint < 0) return false;
if (codepoint == 0) break;
i += skip;
}
return true;
}
/// \brief Emit the definition of the given data-type
///
/// This is currently limited to a 'struct' or 'enum' definitions. The

View file

@ -157,8 +157,7 @@ protected:
void opFunc(const PcodeOp *op); ///< Push a \e functional expression based on the given p-code op to the RPN stack
void opTypeCast(const PcodeOp *op); ///< Push the given p-code op using type-cast syntax to the RPN stack
void opHiddenFunc(const PcodeOp *op); ///< Push the given p-code op as a hidden token
static bool hasCharTerminator(uint1 *buffer,int4 size,int4 charsize);
bool printCharacterConstant(ostream &s,const Address &addr,int4 charsize) const;
bool printCharacterConstant(ostream &s,const Address &addr,Datatype *charType) const;
void resetDefaultsPrintC(void); ///< Set default values for options specific to PrintC
virtual void pushConstant(uintb val,const Datatype *ct,
const Varnode *vn,const PcodeOp *op);
@ -204,7 +203,6 @@ public:
virtual void resetDefaults(void);
virtual void adjustTypeOperators(void);
virtual void setCommentStyle(const string &nm);
virtual bool isCharacterConstant(const uint1 *buf,int4 size,int4 charsize) const;
virtual void docTypeDefinitions(const TypeFactory *typegrp);
virtual void docAllGlobals(void);
virtual void docSingleGlobal(const Symbol *sym);

View file

@ -190,7 +190,7 @@ void PrintJava::printUnicode(ostream &s,int4 onechar) const
s << "\\ux" << setfill('0') << setw(8) << hex << onechar;
return;
}
writeUtf8(s, onechar); // Emit normally
StringManager::writeUtf8(s, onechar); // Emit normally
}
void PrintJava::opLoad(const PcodeOp *op)

View file

@ -478,136 +478,6 @@ bool PrintLanguage::unicodeNeedsEscape(int4 codepoint)
return false;
}
/// Encode the given unicode codepoint as UTF8 (1, 2, 3, or 4 bytes) and
/// write the bytes to the stream.
/// \param s is the output stream
/// \param codepoint is the unicode codepoint
void PrintLanguage::writeUtf8(ostream &s,int4 codepoint)
{
uint1 bytes[4];
int4 size;
if (codepoint < 0)
throw LowlevelError("Negative unicode codepoint");
if (codepoint < 128) {
s.put((uint1)codepoint);
return;
}
int4 bits = mostsigbit_set(codepoint) + 1;
if (bits > 21)
throw LowlevelError("Bad unicode codepoint");
if (bits < 12) { // Encode with two bytes
bytes[0] = 0xc0 ^ ((codepoint >> 6)&0x1f);
bytes[1] = 0x80 ^ (codepoint & 0x3f);
size = 2;
}
else if (bits < 17) {
bytes[0] = 0xe0 ^ ((codepoint >> 12)&0xf);
bytes[1] = 0x80 ^ ((codepoint >> 6)&0x3f);
bytes[2] = 0x80 ^ (codepoint & 0x3f);
size = 3;
}
else {
bytes[0] = 0xf0 ^ ((codepoint >> 18) & 7);
bytes[1] = 0x80 ^ ((codepoint >> 12) & 0x3f);
bytes[2] = 0x80 ^ ((codepoint >> 6) & 0x3f);
bytes[3] = 0x80 ^ (codepoint & 0x3f);
size = 4;
}
s.write((char *)bytes, size);
}
/// Pull the first two bytes from the byte array and combine them in the indicated endian order
/// \param buf is the byte array
/// \param bigend is \b true to request big endian encoding
/// \return the decoded UTF16 element
inline int4 PrintLanguage::readUtf16(const uint1 *buf,bool bigend)
{
int4 codepoint;
if (bigend) {
codepoint = buf[0];
codepoint <<= 8;
codepoint += buf[1];
}
else {
codepoint = buf[1];
codepoint <<= 8;
codepoint += buf[0];
}
return codepoint;
}
/// \brief Extract the next \e unicode \e codepoint from an array of character data
///
/// One or more bytes is consumed from the array, and the number of bytes used is passed back.
/// \param buf is a pointer to the bytes in the character array
/// \param charsize is 1 for UTF8, 2 for UTF16, or 4 for UTF32
/// \param bigend is \b true for big endian encoding of the UTF element
/// \param skip is a reference for passing back the number of bytes consumed
/// \return the codepoint or -1 if the encoding is invalid
int4 PrintLanguage::getCodepoint(const uint1 *buf,int4 charsize,bool bigend,int4 &skip)
{
int4 codepoint;
int4 sk = 0;
if (charsize==2) { // UTF-16
codepoint = readUtf16(buf,bigend);
sk += 2;
if ((codepoint>=0xD800)&&(codepoint<=0xDBFF)) { // high surrogate
int4 trail=readUtf16(buf+2,bigend);
sk += 2;
if ((trail<0xDC00)||(trail>0xDFFF)) return -1; // Bad trail
codepoint = (codepoint<<10) + trail + (0x10000 - (0xD800 << 10) - 0xDC00);
}
else if ((codepoint>=0xDC00)&&(codepoint<=0xDFFF)) return -1; // trail before high
}
else if (charsize==1) { // UTF-8
int4 val = buf[0];
if ((val&0x80)==0) {
codepoint = val;
sk = 1;
}
else if ((val&0xe0)==0xc0) {
int4 val2 = buf[1];
sk = 2;
if ((val2&0xc0)!=0x80) return -1; // Not a valid UTF8-encoding
codepoint = ((val&0x1f)<<6) | (val2 & 0x3f);
}
else if ((val&0xf0)==0xe0) {
int4 val2 = buf[1];
int4 val3 = buf[2];
sk = 3;
if (((val2&0xc0)!=0x80)||((val3&0xc0)!=0x80)) return -1; // invalid encoding
codepoint = ((val&0xf)<<12) | ((val2&0x3f)<<6) | (val3 & 0x3f);
}
else if ((val&0xf8)==0xf0) {
int4 val2 = buf[1];
int4 val3 = buf[2];
int4 val4 = buf[3];
sk = 4;
if (((val2&0xc0)!=0x80)||((val3&0xc0)!=0x80)||((val4&0xc0)!=0x80)) return -1; // invalid encoding
codepoint = ((val&7)<<18) | ((val2&0x3f)<<12) | ((val3&0x3f)<<6) | (val4 & 0x3f);
}
else
return -1;
}
else if (charsize == 4) { // UTF-32
sk = 4;
if (bigend)
codepoint = (buf[0]<<24) + (buf[1]<<16) + (buf[2]<<8) + buf[3];
else
codepoint = (buf[3]<<24) + (buf[2]<<16) + (buf[1]<<8) + buf[0];
}
else
return -1;
if (codepoint >= 0xd800 && codepoint <= 0xdfff)
return -1; // Reserved for surrogates, invalid codepoints
skip = sk;
return codepoint;
}
/// \brief Emit a byte buffer to the stream as unicode characters.
///
/// Characters are emitted until we reach a terminator character or \b count bytes is consumed.
@ -624,7 +494,7 @@ bool PrintLanguage::escapeCharacterData(ostream &s,const uint1 *buf,int4 count,i
int4 skip = charsize;
int4 codepoint = 0;
while(i<count) {
codepoint = getCodepoint(buf+i,charsize,bigend,skip);
codepoint = StringManager::getCodepoint(buf+i,charsize,bigend,skip);
if (codepoint == 0 || codepoint == -1) break;
printUnicode(s,codepoint);
i += skip;

View file

@ -267,9 +267,6 @@ protected:
void emitOp(const ReversePolish &entry); ///< Send an operator token from the RPN to the emitter
void emitAtom(const Atom &atom); ///< Send an variable token from the RPN to the emitter
static bool unicodeNeedsEscape(int4 codepoint); ///< Determine if the given codepoint needs to be escaped
static void writeUtf8(ostream &s,int4 codepoint); ///< Write unicode character to stream in UTF8 encoding
static int4 readUtf16(const uint1 *buf,bool bigend); ///< Read a 2-byte UTF16 element from a byte array
static int4 getCodepoint(const uint1 *buf,int4 charsize,bool bigend,int4 &skip);
bool escapeCharacterData(ostream &s,const uint1 *buf,int4 count,int4 charsize,bool bigend) const;
void recurse(void); ///< Emit from the RPN stack as much as possible
void opBinary(const OpToken *tok,const PcodeOp *op); ///< Push a binary operator onto the RPN stack
@ -434,14 +431,6 @@ public:
/// \param nm is the configuration description
virtual void setCommentStyle(const string &nm)=0;
/// \brief Decide is the given byte array looks like a character string
///
/// This looks for encodings and/or a terminator that is appropriate for the high-level language
/// \param buf is a pointer to the byte array
/// \param size is the number of bytes in the array
/// \param charsize is the size in bytes of the encoding element (i.e. UTF8, UTF16, etc.) to assume
virtual bool isCharacterConstant(const uint1 *buf,int4 size,int4 charsize) const=0;
/// \brief Emit definitions of data-types
///
/// \param typegrp is the container for the data-types that should be defined

View file

@ -6409,14 +6409,8 @@ int4 RulePtrsubCharConstant::applyOp(PcodeOp *op,Funcdata &data)
if (!scope->isReadOnly(symaddr,1,op->getAddr()))
return 0;
// Check if data at the address looks like a string
uint1 buffer[128];
try {
data.getArch()->loader->loadFill(buffer,128,symaddr);
} catch(DataUnavailError &err) {
if (!data.getArch()->stringManager->isString(symaddr, basetype))
return 0;
}
bool isstring = data.getArch()->print->isCharacterConstant(buffer,128,basetype->getSize());
if (!isstring) return 0;
// If we reach here, the PTRSUB should be converted to a (COPY of a) pointer constant.
bool removeCopy = false;

View file

@ -0,0 +1,41 @@
/* ###
* IP: GHIDRA
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "ghidra_string.hh"
/// Record the client interface and allocate the scratch buffer used to receive
/// string bytes from the client.
/// \param g is the ghidra client interface
/// \param max is the maximum number of bytes to allow in a decoded string
GhidraStringManager::GhidraStringManager(ArchitectureGhidra *g,int4 max)
  : StringManager(max), glb(g), testBuffer(new uint1[max])
{
}
/// Release the scratch buffer that was allocated by the constructor.
GhidraStringManager::~GhidraStringManager(void)
{
delete [] testBuffer;
}
/// Return the cached bytes for the address if there are any.  Otherwise the client is
/// queried (filling the scratch buffer) and the result is copied into the cache.
/// \param addr is the address of the string
/// \param charType is the character data-type passed along with the query
/// \return the cached (null terminated) byte buffer for the string
const uint1 *GhidraStringManager::getStringData(const Address &addr,Datatype *charType)

{
  map<Address,const uint1 *>::iterator cached = stringMap.find(addr);
  if (cached == stringMap.end()) {
    // Not seen before: fetch from the client and store a permanent copy
    int4 numBytes = glb->getStringData(testBuffer, addr, charType, maximumBytes);
    return mapBuffer(addr, testBuffer, numBytes);
  }
  return (*cached).second;
}

View file

@ -0,0 +1,39 @@
/* ###
* IP: GHIDRA
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// \file ghidra_string.hh
/// \brief Implementation of the StringManager through the ghidra client
#ifndef __STRING_GHIDRA__
#define __STRING_GHIDRA__
#include "ghidra_arch.hh"
/// \brief Implementation of the StringManager that queries through the ghidra client
///
/// This acts as a front end to Ghidra's string formats and encodings.
/// The client translates any type of string into a UTF8 representation, and this
/// class stores it for final presentation.  Escaping the UTF8 string is left up
/// to the PrintLanguage.
///
/// The object owns a raw scratch buffer, so copying is disabled: the copy constructor
/// and assignment operator are declared private and intentionally never defined.
class GhidraStringManager : public StringManager {
  ArchitectureGhidra *glb;		///< The ghidra client interface
  uint1 *testBuffer;			///< Temporary storage for bytes received from the client (owned by this object)
  GhidraStringManager(const GhidraStringManager &op2);			///< Not implemented (a copy would delete \b testBuffer twice)
  GhidraStringManager &operator=(const GhidraStringManager &op2);	///< Not implemented (a copy would delete \b testBuffer twice)
public:
  GhidraStringManager(ArchitectureGhidra *g,int4 max);	///< Constructor
  virtual ~GhidraStringManager(void);
  virtual const uint1 *getStringData(const Address &addr,Datatype *charType);
};
#endif

View file

@ -0,0 +1,391 @@
/* ###
* IP: GHIDRA
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "stringmanage.hh"
#include "architecture.hh"
/// Make a permanent, null terminated copy of the given bytes and register it in the
/// string map under the given address.  The caller must first make sure no other
/// buffer is already stored at that address.
/// \param addr is the Address to store the buffer at
/// \param buf is the buffer to be copied into storage
/// \param size is the number of bytes in the buffer
/// \return the new permanent copy of the buffer
const uint1 *StringManager::mapBuffer(const Address &addr,const uint1 *buf,int4 size)

{
  uint1 *permanent = new uint1[size + 1];	// One extra byte for the terminator
  memcpy(permanent,buf,size);
  permanent[size] = 0;
  stringMap[addr] = permanent;
  return permanent;
}
/// Set up an empty manager with the given size limit.
/// \param max is the maximum number of bytes to allow in a decoded string
StringManager::StringManager(int4 max)
  : maximumBytes(max)
{
}
/// Free the cached string buffers by delegating to clear().
StringManager::~StringManager(void)
{
clear();
}
/// Free every cached string buffer and forget all addresses.
/// The map entries must be erased along with the buffers: this method is called
/// repeatedly during the manager's life and once more from the destructor, so any
/// entry left behind would be a dangling pointer that gets deleted a second time.
void StringManager::clear(void)

{
  map<Address,const uint1 *>::iterator iter;
  for(iter=stringMap.begin();iter!=stringMap.end();++iter) {
    delete [] (*iter).second;
  }
  stringMap.clear();
}
/// Emit a single unicode codepoint to the stream using the UTF8 encoding, which
/// takes 1, 2, 3, or 4 bytes depending on the number of significant bits.
/// An exception is thrown for a negative codepoint or one needing more than 21 bits.
/// \param s is the output stream
/// \param codepoint is the unicode codepoint
void StringManager::writeUtf8(ostream &s,int4 codepoint)

{
  if (codepoint < 0)
    throw LowlevelError("Negative unicode codepoint");
  if (codepoint < 128) {		// Plain ASCII is emitted as is
    s.put((uint1)codepoint);
    return;
  }
  const int4 numBits = mostsigbit_set(codepoint) + 1;
  if (numBits > 21)
    throw LowlevelError("Bad unicode codepoint");

  // Choose the sequence length and the marker bits of the lead byte
  int4 length;
  int4 leadMarker;
  if (numBits < 12) {
    length = 2;
    leadMarker = 0xc0;
  }
  else if (numBits < 17) {
    length = 3;
    leadMarker = 0xe0;
  }
  else {
    length = 4;
    leadMarker = 0xf0;
  }

  // Fill in continuation bytes from the end, 6 bits at a time; what remains goes in the lead byte
  uint1 encoded[4];
  int4 remaining = codepoint;
  for(int4 pos=length-1;pos>0;--pos) {
    encoded[pos] = (uint1)(0x80 | (remaining & 0x3f));
    remaining >>= 6;
  }
  encoded[0] = (uint1)(leadMarker | remaining);
  s.write((char *)encoded, length);
}
/// Decide whether string data can be recovered at the given address.
/// The decision is made by requesting the string through getStringData(), so a
/// successful decoding may be cached as a side-effect.  The character data-type
/// is passed through to getStringData().
/// \param addr is the given address
/// \param charType is the given character data-type
/// \return \b true if the address represents string data
bool StringManager::isString(const Address &addr,Datatype *charType)

{
  try {
    return (getStringData(addr,charType) != (const uint1 *)0);
  }
  catch(DataUnavailError &err) {
    // The data could not be obtained, so it cannot be a string
  }
  return false;
}
/// Write a \<stringmanage> tag, with one \<string> sub-tag per cached string.
/// Each \<string> holds the address tag followed by a \<bytes> tag, which lists the
/// bytes of the string (up to but not including the first zero byte) as 2-digit hex values.
/// \param s is the stream to write to
void StringManager::saveXml(ostream &s) const

{
  s << "<stringmanage>\n";

  map<Address,const uint1 *>::const_iterator iter1;
  for(iter1=stringMap.begin();iter1!=stringMap.end();++iter1) {
    s << "<string>\n";
    (*iter1).first.saveXml(s);
    const uint1 *buf = (*iter1).second;
    // Remember the stream formatting so the hex/fill settings do not leak into later output
    const ios_base::fmtflags savedFlags = s.flags();
    const char savedFill = s.fill();
    s << " <bytes>\n" << setfill('0');
    for(int4 i=0;;++i) {
      if (buf[i] == 0) break;
      s << hex << setw(2) << (int4)buf[i];
      if (i%20 == 19)
	s << "\n ";
    }
    s << "\n </bytes>\n";
    s.flags(savedFlags);
    s.fill(savedFill);
    s << "</string>\n";		// Every <string> must be closed for the document to be well formed
  }
  s << "</stringmanage>\n";
}
/// Read \<stringmanage> tag, with \<string> sub-tags.
/// \param el is the root tag element
/// \param m is the manager for looking up AddressSpaces
void StringManager::restoreXml(const Element *el,const AddrSpaceManager *m)
{
const List &list(el->getChildren());
List::const_iterator iter;
iter = list.begin();
Address addr = Address::restoreXml(*iter, m);
++iter;
vector<uint1> vec;
istringstream is((*iter)->getContent());
int4 val;
char c1, c2;
is >> ws;
c1 = is.get();
c2 = is.get();
while ((c1 > 0) && (c2 > 0)) {
if (c1 <= '9')
c1 = c1 - '0';
else if (c1 <= 'F')
c1 = c1 + 10 - 'A';
else
c1 = c1 + 10 - 'a';
if (c2 <= '9')
c2 = c2 - '0';
else if (c2 <= 'F')
c2 = c2 + 10 - 'A';
else
c2 = c2 + 10 - 'a';
val = c1 * 16 + c2;
vec.push_back((uint1) val);
is >> ws;
c1 = is.get();
c2 = is.get();
}
mapBuffer(addr,vec.data(),vec.size());
}
/// Scan the buffer, one character element at a time, looking for an element whose bytes
/// are all zero.  Only complete elements are examined, so no byte at or beyond \b size
/// is ever read, even if \b size is not a multiple of \b charsize.
/// \param buffer is the byte buffer
/// \param size is the number of bytes in the buffer
/// \param charsize is the presumed size (in bytes) of character elements
/// \return \b true if a string terminator is found
bool StringManager::hasCharTerminator(const uint1 *buffer,int4 size,int4 charsize)

{
  if (charsize <= 0) return false;	// No meaningful element size, and the scan below would never advance
  for(int4 i=0;i+charsize<=size;i+=charsize) {
    bool isTerminator = true;
    for(int4 j=0;j<charsize;++j) {
      if (buffer[i+j] != 0) {	// Non-zero bytes means character can't be a null terminator
	isTerminator = false;
	break;
      }
    }
    if (isTerminator) return true;
  }
  return false;
}
/// Combine the first two bytes of the array into a single UTF16 element, taking the
/// most significant byte from the position dictated by the endianness.
/// \param buf is the byte array
/// \param bigend is \b true to request big endian encoding
/// \return the decoded UTF16 element
inline int4 StringManager::readUtf16(const uint1 *buf,bool bigend)

{
  const int4 high = bigend ? buf[0] : buf[1];
  const int4 low = bigend ? buf[1] : buf[0];
  return (high << 8) + low;
}
/// Decode the next unicode codepoint from an array of character data.
/// One or more bytes is consumed from the array, and the number of bytes used is passed back.
/// The count is only passed back on success; \b skip is left untouched when -1 is returned.
/// Besides malformed sequences, values in the surrogate range or beyond U+10FFFF are
/// treated as invalid.  Up to 4 bytes starting at \b buf may be read.
/// \param buf is a pointer to the bytes in the character array
/// \param charsize is 1 for UTF8, 2 for UTF16, or 4 for UTF32
/// \param bigend is \b true for big endian encoding of the UTF element
/// \param skip is a reference for passing back the number of bytes consumed
/// \return the codepoint or -1 if the encoding is invalid
int4 StringManager::getCodepoint(const uint1 *buf,int4 charsize,bool bigend,int4 &skip)

{
  int4 codepoint;
  int4 sk = 0;
  if (charsize==2) {		// UTF-16
    codepoint = readUtf16(buf,bigend);
    sk += 2;
    if ((codepoint>=0xD800)&&(codepoint<=0xDBFF)) { // high surrogate
      int4 trail=readUtf16(buf+2,bigend);
      sk += 2;
      if ((trail<0xDC00)||(trail>0xDFFF)) return -1; // Bad trail
      codepoint = (codepoint<<10) + trail + (0x10000 - (0xD800 << 10) - 0xDC00);
    }
    else if ((codepoint>=0xDC00)&&(codepoint<=0xDFFF)) return -1; // trail before high
  }
  else if (charsize==1) {	// UTF-8
    int4 val = buf[0];
    if ((val&0x80)==0) {
      codepoint = val;
      sk = 1;
    }
    else if ((val&0xe0)==0xc0) {
      int4 val2 = buf[1];
      sk = 2;
      if ((val2&0xc0)!=0x80) return -1; // Not a valid UTF8-encoding
      codepoint = ((val&0x1f)<<6) | (val2 & 0x3f);
    }
    else if ((val&0xf0)==0xe0) {
      int4 val2 = buf[1];
      int4 val3 = buf[2];
      sk = 3;
      if (((val2&0xc0)!=0x80)||((val3&0xc0)!=0x80)) return -1; // invalid encoding
      codepoint = ((val&0xf)<<12) | ((val2&0x3f)<<6) | (val3 & 0x3f);
    }
    else if ((val&0xf8)==0xf0) {
      int4 val2 = buf[1];
      int4 val3 = buf[2];
      int4 val4 = buf[3];
      sk = 4;
      if (((val2&0xc0)!=0x80)||((val3&0xc0)!=0x80)||((val4&0xc0)!=0x80)) return -1; // invalid encoding
      codepoint = ((val&7)<<18) | ((val2&0x3f)<<12) | ((val3&0x3f)<<6) | (val4 & 0x3f);
    }
    else
      return -1;
  }
  else if (charsize == 4) {	// UTF-32
    sk = 4;
    // Assemble the element with unsigned arithmetic.  Shifting a byte >= 0x80 into the top
    // of a signed integer overflows, and could yield a negative value other than -1.
    uint4 raw;
    if (bigend)
      raw = ((uint4)buf[0]<<24) | ((uint4)buf[1]<<16) | ((uint4)buf[2]<<8) | (uint4)buf[3];
    else
      raw = ((uint4)buf[3]<<24) | ((uint4)buf[2]<<16) | ((uint4)buf[1]<<8) | (uint4)buf[0];
    if (raw > 0x10ffff) return -1;	// Not a unicode codepoint
    codepoint = (int4)raw;
  }
  else
    return -1;
  if (codepoint >= 0xd800 && codepoint <= 0xdfff)
    return -1;		// Reserved for surrogates, invalid codepoints
  if (codepoint > 0x10ffff)
    return -1;		// Beyond the unicode range (a 4-byte UTF8 sequence can express this)
  skip = sk;
  return codepoint;
}
/// Record the architecture and allocate the scratch buffer used to pull bytes from the
/// load image.  The buffer carries 4 extra bytes beyond \b max, and starts out zeroed.
/// Decoding a codepoint can read ahead up to 3 bytes past the current element, so a string
/// that fills the whole buffer would otherwise be read past the end of the allocation.
/// \param g is the underlying architecture (and loadimage)
/// \param max is the maximum number of bytes to allow in a decoded string
StringManagerUnicode::StringManagerUnicode(Architecture *g,int4 max)
  : StringManager(max)
{
  glb = g;
  testBuffer = new uint1[max + 4]();	// Value-initialized: the padding bytes are guaranteed to be zero
}
/// Release the scratch buffer that was allocated by the constructor.
StringManagerUnicode::~StringManagerUnicode(void)
{
delete [] testBuffer;
}
/// Return the decoded string at the given address, reading it from the load image if it
/// has not been cached yet.  Bytes are pulled in 32 byte pieces until a terminator is seen
/// or the maximum number of bytes is reached.  Single byte data is checked for validity and
/// stored as is; wider encodings are translated to UTF8 before being stored.
/// A failed decoding is not cached.  Exceptions thrown by the load image are passed on
/// to the caller.
/// \param addr is the address of the string data
/// \param charType is the character data-type, whose size selects the encoding
/// \return the cached byte buffer, or null if the bytes are not valid unicode
const uint1 *StringManagerUnicode::getStringData(const Address &addr,Datatype *charType)

{
  map<Address,const uint1 *>::iterator iter;
  iter = stringMap.find(addr);
  if (iter != stringMap.end())
    return (*iter).second;

  int4 curBufferSize = 0;
  int4 charsize = charType->getSize();
  bool foundTerminator = false;

  do {
    int4 amount = 32;	// Grab 32 bytes of image at a time
    int4 newBufferSize = curBufferSize + amount;	// Signed, to compare cleanly against maximumBytes
    if (newBufferSize > maximumBytes) {
      newBufferSize = maximumBytes;
      amount = newBufferSize - curBufferSize;
      if (amount <= 0) break;
    }
    glb->loader->loadFill(testBuffer+curBufferSize,amount,addr + curBufferSize);
    foundTerminator = hasCharTerminator(testBuffer+curBufferSize,amount,charsize);
    curBufferSize = newBufferSize;
  } while (!foundTerminator);

  const uint1 *resBuffer;
  if (charsize == 1) {
    if (!isCharacterConstant(testBuffer,curBufferSize,charsize))
      return (const uint1 *)0;
    resBuffer = mapBuffer(addr,testBuffer,curBufferSize);
  }
  else {
    // We need to translate to UTF8
    ostringstream s;
    if (!writeUnicode(s, testBuffer, curBufferSize, charsize))
      return (const uint1 *)0;
    string resString = s.str();
    int4 newSize = resString.size();
    if (newSize > maximumBytes) {
      newSize = maximumBytes;
      // The translation can be longer than the raw data.  When cutting it down, back up to
      // the start of a character so a multi-byte UTF8 sequence is never split in half.
      while((newSize > 0)&&((((uint1)resString[newSize]) & 0xc0) == 0x80))
	newSize -= 1;
    }
    resBuffer = mapBuffer(addr,(const uint1 *)resString.c_str(),newSize);
  }
  return resBuffer;
}
/// If the string is encoded in UTF8 or ASCII, we get (on average) a bit of check
/// per character. For UTF16, the surrogate reserved area gives at least some check.
/// \param buf is the byte array to check
/// \param size is the size of the buffer in bytes
/// \param charsize is the UTF encoding (1=UTF8, 2=UTF16, 4=UTF32)
/// \return \b true if the buffer is filled with valid unicode
bool StringManagerUnicode::isCharacterConstant(const uint1 *buf,int4 size,int4 charsize) const
{
if (buf == (const uint1 *)0) return false;
bool bigend = glb->translate->isBigEndian();
int4 i=0;
int4 skip = charsize;
while(i<size) {
int4 codepoint = getCodepoint(buf+i,charsize,bigend,skip);
if (codepoint < 0) return false;
if (codepoint == 0) break;
i += skip;
}
return true;
}
/// Treat the buffer as a null terminated unicode string in the given encoding and
/// emit each code point to the stream as UTF8.  Output stops at the terminator
/// (which is not written) or at the end of the buffer.  Characters already written
/// stay in the stream if an invalid encoding is hit later.
///
/// NOTE(review): getCodepoint is handed only a pointer, not the number of bytes left,
/// so a multi-byte character starting near the end of \b buffer may be read past \b size.
/// \param s is the output stream
/// \param buffer is the given byte buffer
/// \param size is the number of bytes in the buffer
/// \param charsize specifies the encoding (1=UTF8 2=UTF16 4=UTF32)
/// \return \b true if the byte array contains valid unicode
bool StringManagerUnicode::writeUnicode(ostream &s,uint1 *buffer,int4 size,int4 charsize)

{
  const bool isBigEndian = glb->translate->isBigEndian();
  int4 advance = charsize;	// Updated by getCodepoint with the bytes consumed
  for(int4 offset=0;offset<size;offset+=advance) {
    int4 cp = getCodepoint(buffer+offset,charsize,isBigEndian,advance);
    if (cp < 0)
      return false;		// Invalid encoding
    if (cp == 0)
      break;			// Terminator
    writeUtf8(s,cp);
  }
  return true;
}

View file

@ -0,0 +1,80 @@
/* ###
* IP: GHIDRA
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// \file stringmanage.hh
/// \brief Classes for decoding and storing string data
#ifndef __STRING_MANAGE__
#define __STRING_MANAGE__
#include "type.hh"
class Architecture;
/// \brief Storage for decoding and storing strings associated with an address
///
/// Looks at data in the loadimage to determine if it represents a "string".
/// Decodes the string for presentation in the output.
/// Stores the decoded string until it is needed for presentation.
class StringManager {
protected:
map<Address,const uint1 *> stringMap; ///< Map from address to string (in UTF8 format)
int4 maximumBytes; ///< Maximum bytes (in UTF8 encoding) allowed
const uint1 *mapBuffer(const Address &addr,const uint1 *buf,int4 size); ///< Move a decoded buffer into storage
public:
StringManager(int4 max); ///< Constructor
virtual ~StringManager(void); ///< Destructor
int4 getMaximumBytes(void) const { return maximumBytes; } ///< Return the maximum bytes allowed in a string decoding
void clear(void); ///< Clear out any cached strings
bool isString(const Address &addr,Datatype *charType); ///< Determine if data at the given address is a string
/// \brief Retrieve string data at the given address as a UTF8 byte array
///
/// If the address does not represent string data, null is returned. Otherwise,
/// the string data is fetched, converted to a UTF8 encoding, cached and returned.
/// \param addr is the given address
/// \param charType is a character data-type indicating the encoding
/// \return the byte array of UTF8 data (or null)
virtual const uint1 *getStringData(const Address &addr,Datatype *charType)=0;
void saveXml(ostream &s) const; ///< Save cached strings to a stream as XML
void restoreXml(const Element *el,const AddrSpaceManager *m); ///< Restore string cache from XML
static bool hasCharTerminator(const uint1 *buffer,int4 size,int4 charsize); ///< Check for a unicode string terminator
static int4 readUtf16(const uint1 *buf,bool bigend); ///< Read a UTF16 code point from a byte array
static void writeUtf8(ostream &s,int4 codepoint); ///< Write unicode character to stream in UTF8 encoding
/// \brief Extract next \e unicode \e codepoint
///
/// On success the number of bytes consumed is passed back through \b skip.
/// A negative value is returned for an invalid encoding or a reserved surrogate value.
/// No buffer length is passed in, so the caller must guarantee that enough bytes
/// are readable at \b buf for a complete character.
static int4 getCodepoint(const uint1 *buf,int4 charsize,bool bigend,int4 &skip);
};
/// \brief An implementation of StringManager that understands terminated unicode strings
///
/// This class understands UTF8, UTF16, and UTF32 encodings.  It reports a string if it
/// sees a valid encoding that is null terminated.
///
/// The object owns the scratch buffer \b testBuffer (allocated in the constructor and
/// freed in the destructor).  Copying would leave two objects freeing the same buffer,
/// so the copy operations are declared private and intentionally left undefined.
class StringManagerUnicode : public StringManager {
  Architecture *glb;			///< Underlying architecture
  uint1 *testBuffer;			///< Temporary buffer for pulling in loadimage bytes
  StringManagerUnicode(const StringManagerUnicode &op2);		///< Copying is disabled (not implemented)
  StringManagerUnicode &operator=(const StringManagerUnicode &op2);	///< Assignment is disabled (not implemented)
public:
  StringManagerUnicode(Architecture *g,int4 max);	///< Constructor
  virtual ~StringManagerUnicode(void);			///< Destructor, releases the scratch buffer
  /// \brief Retrieve (and cache) the string at the given address, converted to UTF8
  virtual const uint1 *getStringData(const Address &addr,Datatype *charType);
  bool isCharacterConstant(const uint1 *buf,int4 size,int4 charsize) const;	///< Return \b true if buffer looks like unicode
  bool writeUnicode(ostream &s,uint1 *buffer,int4 size,int4 charsize);	///< Write unicode byte array to stream (as UTF8)
};
#endif

View file

@ -18,6 +18,7 @@ package ghidra.app.decompiler;
import java.io.IOException;
import java.io.StringReader;
import java.math.BigInteger;
import java.nio.charset.Charset;
import java.util.ArrayList;
import javax.xml.parsers.SAXParser;
@ -27,14 +28,15 @@ import org.xml.sax.*;
import org.xml.sax.helpers.DefaultHandler;
import ghidra.app.cmd.function.CallDepthChangeInfo;
import ghidra.docking.settings.Settings;
import ghidra.docking.settings.SettingsImpl;
import ghidra.program.disassemble.Disassembler;
import ghidra.program.model.address.*;
import ghidra.program.model.data.DataType;
import ghidra.program.model.data.*;
import ghidra.program.model.lang.*;
import ghidra.program.model.lang.ConstantPool.Record;
import ghidra.program.model.listing.*;
import ghidra.program.model.mem.MemoryAccessException;
import ghidra.program.model.mem.MemoryBlock;
import ghidra.program.model.mem.*;
import ghidra.program.model.pcode.*;
import ghidra.program.model.symbol.*;
import ghidra.util.Msg;
@ -65,6 +67,7 @@ public class DecompileCallback {
private AddressFactory addrfactory;
private ConstantPool cpool;
private PcodeDataTypeManager dtmanage;
private Charset utf8Charset;
private String nativeMessage;
private boolean showNamespace;
@ -84,6 +87,7 @@ public class DecompileCallback {
cpool = null;
nativeMessage = null;
debug = null;
utf8Charset = Charset.availableCharsets().get("UTF-8");
}
private static SAXParser getSAXParser() throws PcodeXMLException {
@ -1177,6 +1181,66 @@ public class DecompileCallback {
return listing.getFunctionAt(addr);
}
/**
 * Fetch the string stored at an address and return it encoded as UTF-8.
 *
 * The data-type used to decode the bytes is chosen in this order:
 * <ol>
 * <li>the data-type of defined data containing the address, if it is a string data-type</li>
 * <li>the base type found from the given name and id, if it is a string data-type</li>
 * <li>a terminated string type picked from the length of that base type
 * (2 and 4 select the two wider terminated types, anything else the single byte type)</li>
 * </ol>
 * The result is limited to the maximum size parsed from the address description. If the
 * encoded string is longer, it is cut at the last complete UTF-8 character that fits.
 *
 * @param addrString is the XML description of the address (also parsed for the maximum size)
 * @param dtName is the name of the character data-type
 * @param dtId is the id of the character data-type
 * @return the UTF-8 encoded bytes, or null if the address cannot be parsed or the
 *         decoded value is not a String
 */
public byte[] getStringData(String addrString, String dtName, String dtId) {
	Address addr;
	int maxBytes;
	try {
		maxBytes = readXMLSize(addrString);
		addr = Varnode.readXMLAddress(addrString, addrfactory, funcEntry.getAddressSpace());
		if (addr == Address.NO_ADDRESS) {
			throw new PcodeXMLException("Address does not physically map");
		}
	}
	catch (PcodeXMLException e) {
		Msg.error(this, "Decompiling " + funcEntry + ": " + e.getMessage());
		return null;
	}
	Data data = program.getListing().getDataContaining(addr);
	Settings settings = SettingsImpl.NO_SETTINGS;
	AbstractStringDataType dataType = null;
	if (data != null) {
		settings = data;
		if (data.getDataType() instanceof AbstractStringDataType) {
			dataType = (AbstractStringDataType) data.getDataType();
		}
	}
	if (dataType == null) {
		DataType dt = dtmanage.findBaseType(dtName, dtId);
		if (dt instanceof AbstractStringDataType) {
			dataType = (AbstractStringDataType) dt;
		}
		else {
			// No string type available: choose a terminated type by character width
			int size = (dt != null) ? dt.getLength() : 1;
			if (size == 2) {
				dataType = TerminatedUnicodeDataType.dataType;
			}
			else if (size == 4) {
				dataType = TerminatedUnicode32DataType.dataType;
			}
			else {
				dataType = TerminatedStringDataType.dataType;
			}
		}
	}
	MemoryBufferImpl buf = new MemoryBufferImpl(program.getMemory(), addr, 64);
	Object value = dataType.getValue(buf, settings, maxBytes);
	if (!(value instanceof String)) {
		return null;
	}
	String stringVal = (String) value;
	byte[] res = stringVal.getBytes(utf8Charset);
	if (res.length > maxBytes) {
		int trimLen = maxBytes;
		// res[trimLen] is the first byte being dropped. If it is a UTF-8 continuation
		// byte (10xxxxxx) the cut would split a character, so back up to its lead byte.
		while (trimLen > 0 && (res[trimLen] & 0xc0) == 0x80) {
			trimLen -= 1;
		}
		byte[] trim = new byte[trimLen];
		System.arraycopy(res, 0, trim, 0, trimLen);
		res = trim;	// Previously the trimmed copy was built and then discarded
	}
	return res;
}
//==================================================================================================
// Inner Classes
//==================================================================================================

View file

@ -326,7 +326,12 @@ public class DecompileProcess {
}
break;
case 'S':
if (name.equals("getString")) {
getStringData();
}
else {
getSymbol(); // getSymbol
}
break;
case 'T':
if (name.equals("getType")) {
@ -778,6 +783,31 @@ public class DecompileProcess {
write(query_response_end);
}
/**
 * Answer a "getString" query from the decompiler process.
 *
 * Reads the address, data-type name and data-type id from the query, asks the callback
 * for the string bytes and writes them back inside a query response. A null or empty
 * result produces an empty response. Otherwise the payload is sent as a byte section:
 * two length characters (low 6 bits, then the next 6 bits, each offset by 0x20)
 * followed by every byte expanded into two characters, one per nibble, offset from 'A'.
 * Only the low 12 bits of the length are transmitted.
 *
 * @throws IOException for problems reading from or writing to the decompiler process
 */
private void getStringData() throws IOException {
	String addrText = readQueryString();
	String typeName = readQueryString();
	String typeId = readQueryString();
	byte[] payload = callback.getStringData(addrText, typeName, typeId);

	write(query_response_start);
	boolean hasPayload = (payload != null) && (payload.length > 0);
	if (hasPayload) {
		int length = payload.length;
		int lowSix = (length & 0x3f) + 0x20;
		int highSix = ((length >>> 6) & 0x3f) + 0x20;
		write(byte_start);
		write(lowSix);
		write(highSix);

		// Expand each byte into two characters: high nibble first, then low nibble
		byte[] encoded = new byte[length * 2];
		int out = 0;
		for (byte b : payload) {
			encoded[out++] = (byte) (((b >> 4) & 0xf) + 'A');
			encoded[out++] = (byte) ((b & 0xf) + 'A');
		}
		write(encoded);
		write(byte_end);
	}
	write(query_response_end);
}
private void write(byte[] bytes) throws IOException {
if (nativeOut == null) {
return;