mirror of
https://github.com/NationalSecurityAgency/ghidra.git
synced 2025-10-05 02:39:44 +02:00
Initial commit of new StringManager
This commit is contained in:
parent
6a15520aa5
commit
0eb48e441f
19 changed files with 740 additions and 236 deletions
391
Ghidra/Features/Decompiler/src/decompile/cpp/stringmanage.cc
Normal file
391
Ghidra/Features/Decompiler/src/decompile/cpp/stringmanage.cc
Normal file
|
@ -0,0 +1,391 @@
|
|||
/* ###
|
||||
* IP: GHIDRA
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "stringmanage.hh"
|
||||
#include "architecture.hh"
|
||||
|
||||
/// Before calling, we must check that there is no other buffer stored at the address.
|
||||
/// \param addr is the Address to store the buffer at
|
||||
/// \param buf is the buffer to be copied into storage
|
||||
/// \param size is the number of bytes in the buffer
|
||||
/// \return the new permanent copy of the buffer
|
||||
const uint1 *StringManager::mapBuffer(const Address &addr,const uint1 *buf,int4 size)
|
||||
|
||||
{
|
||||
uint1 *storeBuf = new uint1[size + 1];
|
||||
stringMap[addr] = storeBuf;
|
||||
memcpy(storeBuf,buf,size);
|
||||
storeBuf[size] = 0;
|
||||
return storeBuf;
|
||||
}
|
||||
|
||||
/// \param max is the maximum number of bytes to allow in a decoded string
|
||||
StringManager::StringManager(int4 max)
|
||||
|
||||
{
|
||||
maximumBytes = max;
|
||||
}
|
||||
|
||||
/// Destructor: hands off to clear() to dispose of the buffers recorded in the string map.
StringManager::~StringManager(void)

{
  this->clear();
}
|
||||
|
||||
void StringManager::clear(void)
|
||||
|
||||
{
|
||||
map<Address,const uint1 *>::iterator iter;
|
||||
|
||||
for(iter=stringMap.begin();iter!=stringMap.end();++iter) {
|
||||
delete [] (*iter).second;
|
||||
}
|
||||
}
|
||||
|
||||
/// Encode the given unicode codepoint as UTF8 (1, 2, 3, or 4 bytes) and
|
||||
/// write the bytes to the stream.
|
||||
/// \param s is the output stream
|
||||
/// \param codepoint is the unicode codepoint
|
||||
void StringManager::writeUtf8(ostream &s,int4 codepoint)
|
||||
|
||||
{
|
||||
uint1 bytes[4];
|
||||
int4 size;
|
||||
|
||||
if (codepoint < 0)
|
||||
throw LowlevelError("Negative unicode codepoint");
|
||||
if (codepoint < 128) {
|
||||
s.put((uint1)codepoint);
|
||||
return;
|
||||
}
|
||||
int4 bits = mostsigbit_set(codepoint) + 1;
|
||||
if (bits > 21)
|
||||
throw LowlevelError("Bad unicode codepoint");
|
||||
if (bits < 12) { // Encode with two bytes
|
||||
bytes[0] = 0xc0 ^ ((codepoint >> 6)&0x1f);
|
||||
bytes[1] = 0x80 ^ (codepoint & 0x3f);
|
||||
size = 2;
|
||||
}
|
||||
else if (bits < 17) {
|
||||
bytes[0] = 0xe0 ^ ((codepoint >> 12)&0xf);
|
||||
bytes[1] = 0x80 ^ ((codepoint >> 6)&0x3f);
|
||||
bytes[2] = 0x80 ^ (codepoint & 0x3f);
|
||||
size = 3;
|
||||
}
|
||||
else {
|
||||
bytes[0] = 0xf0 ^ ((codepoint >> 18) & 7);
|
||||
bytes[1] = 0x80 ^ ((codepoint >> 12) & 0x3f);
|
||||
bytes[2] = 0x80 ^ ((codepoint >> 6) & 0x3f);
|
||||
bytes[3] = 0x80 ^ (codepoint & 0x3f);
|
||||
size = 4;
|
||||
}
|
||||
s.write((char *)bytes, size);
|
||||
}
|
||||
|
||||
/// Returns \b true if the data is some kind of complete string.
|
||||
/// A given character data-type can be used as a hint for the encoding.
|
||||
/// The string decoding can be cached internally.
|
||||
/// \param addr is the given address
|
||||
/// \param charType is the given character data-type
|
||||
/// \return \b true if the address represents string data
|
||||
bool StringManager::isString(const Address &addr,Datatype *charType)
|
||||
|
||||
{
|
||||
const uint1 *buffer = (const uint1 *)0;
|
||||
try {
|
||||
buffer = getStringData(addr,charType);
|
||||
}
|
||||
catch(DataUnavailError &err) {
|
||||
return false;
|
||||
}
|
||||
return (buffer != (const uint1 *)0);
|
||||
}
|
||||
|
||||
/// Write \<stringmanage> tag, with \<string> sub-tags.
|
||||
/// \param s is the stream to write to
|
||||
void StringManager::saveXml(ostream &s) const
|
||||
|
||||
{
|
||||
s << "<stringmanage>\n";
|
||||
|
||||
map<Address,const uint1 *>::const_iterator iter1;
|
||||
for(iter1=stringMap.begin();iter1!=stringMap.end();++iter1) {
|
||||
s << "<string>\n";
|
||||
(*iter1).first.saveXml(s);
|
||||
const uint1 *buf = (*iter1).second;
|
||||
s << " <bytes>\n" << setfill('0');
|
||||
for(int4 i=0;;++i) {
|
||||
if (buf[i] == 0) break;
|
||||
s << hex << setw(2) << (int4)buf[i];
|
||||
if (i%20 == 19)
|
||||
s << "\n ";
|
||||
}
|
||||
s << "\n </bytes>\n";
|
||||
}
|
||||
s << "</stringmanage>\n";
|
||||
}
|
||||
|
||||
/// Read \<stringmanage> tag, with \<string> sub-tags.
|
||||
/// \param el is the root tag element
|
||||
/// \param m is the manager for looking up AddressSpaces
|
||||
void StringManager::restoreXml(const Element *el,const AddrSpaceManager *m)
|
||||
|
||||
{
|
||||
const List &list(el->getChildren());
|
||||
List::const_iterator iter;
|
||||
iter = list.begin();
|
||||
Address addr = Address::restoreXml(*iter, m);
|
||||
++iter;
|
||||
vector<uint1> vec;
|
||||
istringstream is((*iter)->getContent());
|
||||
int4 val;
|
||||
char c1, c2;
|
||||
is >> ws;
|
||||
c1 = is.get();
|
||||
c2 = is.get();
|
||||
while ((c1 > 0) && (c2 > 0)) {
|
||||
if (c1 <= '9')
|
||||
c1 = c1 - '0';
|
||||
else if (c1 <= 'F')
|
||||
c1 = c1 + 10 - 'A';
|
||||
else
|
||||
c1 = c1 + 10 - 'a';
|
||||
if (c2 <= '9')
|
||||
c2 = c2 - '0';
|
||||
else if (c2 <= 'F')
|
||||
c2 = c2 + 10 - 'A';
|
||||
else
|
||||
c2 = c2 + 10 - 'a';
|
||||
val = c1 * 16 + c2;
|
||||
vec.push_back((uint1) val);
|
||||
is >> ws;
|
||||
c1 = is.get();
|
||||
c2 = is.get();
|
||||
}
|
||||
mapBuffer(addr,vec.data(),vec.size());
|
||||
}
|
||||
|
||||
/// \param buffer is the byte buffer
|
||||
/// \param size is the number of bytes in the buffer
|
||||
/// \param charsize is the presumed size (in bytes) of character elements
|
||||
/// \return \b true if a string terminator is found
|
||||
bool StringManager::hasCharTerminator(const uint1 *buffer,int4 size,int4 charsize)
|
||||
|
||||
{
|
||||
for(int4 i=0;i<size;i+=charsize) {
|
||||
bool isTerminator = true;
|
||||
for(int4 j=0;j<charsize;++j) {
|
||||
if (buffer[i+j] != 0) { // Non-zero bytes means character can't be a null terminator
|
||||
isTerminator = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (isTerminator) return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/// Pull the first two bytes from the byte array and combine them in the indicated endian order
|
||||
/// \param buf is the byte array
|
||||
/// \param bigend is \b true to request big endian encoding
|
||||
/// \return the decoded UTF16 element
|
||||
inline int4 StringManager::readUtf16(const uint1 *buf,bool bigend)
|
||||
|
||||
{
|
||||
int4 codepoint;
|
||||
if (bigend) {
|
||||
codepoint = buf[0];
|
||||
codepoint <<= 8;
|
||||
codepoint += buf[1];
|
||||
}
|
||||
else {
|
||||
codepoint = buf[1];
|
||||
codepoint <<= 8;
|
||||
codepoint += buf[0];
|
||||
}
|
||||
return codepoint;
|
||||
}
|
||||
|
||||
/// One or more bytes is consumed from the array, and the number of bytes used is passed back.
|
||||
/// \param buf is a pointer to the bytes in the character array
|
||||
/// \param charsize is 1 for UTF8, 2 for UTF16, or 4 for UTF32
|
||||
/// \param bigend is \b true for big endian encoding of the UTF element
|
||||
/// \param skip is a reference for passing back the number of bytes consumed
|
||||
/// \return the codepoint or -1 if the encoding is invalid
|
||||
int4 StringManager::getCodepoint(const uint1 *buf,int4 charsize,bool bigend,int4 &skip)
|
||||
|
||||
{
|
||||
int4 codepoint;
|
||||
int4 sk = 0;
|
||||
if (charsize==2) { // UTF-16
|
||||
codepoint = readUtf16(buf,bigend);
|
||||
sk += 2;
|
||||
if ((codepoint>=0xD800)&&(codepoint<=0xDBFF)) { // high surrogate
|
||||
int4 trail=readUtf16(buf+2,bigend);
|
||||
sk += 2;
|
||||
if ((trail<0xDC00)||(trail>0xDFFF)) return -1; // Bad trail
|
||||
codepoint = (codepoint<<10) + trail + (0x10000 - (0xD800 << 10) - 0xDC00);
|
||||
}
|
||||
else if ((codepoint>=0xDC00)&&(codepoint<=0xDFFF)) return -1; // trail before high
|
||||
}
|
||||
else if (charsize==1) { // UTF-8
|
||||
int4 val = buf[0];
|
||||
if ((val&0x80)==0) {
|
||||
codepoint = val;
|
||||
sk = 1;
|
||||
}
|
||||
else if ((val&0xe0)==0xc0) {
|
||||
int4 val2 = buf[1];
|
||||
sk = 2;
|
||||
if ((val2&0xc0)!=0x80) return -1; // Not a valid UTF8-encoding
|
||||
codepoint = ((val&0x1f)<<6) | (val2 & 0x3f);
|
||||
}
|
||||
else if ((val&0xf0)==0xe0) {
|
||||
int4 val2 = buf[1];
|
||||
int4 val3 = buf[2];
|
||||
sk = 3;
|
||||
if (((val2&0xc0)!=0x80)||((val3&0xc0)!=0x80)) return -1; // invalid encoding
|
||||
codepoint = ((val&0xf)<<12) | ((val2&0x3f)<<6) | (val3 & 0x3f);
|
||||
}
|
||||
else if ((val&0xf8)==0xf0) {
|
||||
int4 val2 = buf[1];
|
||||
int4 val3 = buf[2];
|
||||
int4 val4 = buf[3];
|
||||
sk = 4;
|
||||
if (((val2&0xc0)!=0x80)||((val3&0xc0)!=0x80)||((val4&0xc0)!=0x80)) return -1; // invalid encoding
|
||||
codepoint = ((val&7)<<18) | ((val2&0x3f)<<12) | ((val3&0x3f)<<6) | (val4 & 0x3f);
|
||||
}
|
||||
else
|
||||
return -1;
|
||||
}
|
||||
else if (charsize == 4) { // UTF-32
|
||||
sk = 4;
|
||||
if (bigend)
|
||||
codepoint = (buf[0]<<24) + (buf[1]<<16) + (buf[2]<<8) + buf[3];
|
||||
else
|
||||
codepoint = (buf[3]<<24) + (buf[2]<<16) + (buf[1]<<8) + buf[0];
|
||||
}
|
||||
else
|
||||
return -1;
|
||||
if (codepoint >= 0xd800 && codepoint <= 0xdfff)
|
||||
return -1; // Reserved for surrogates, invalid codepoints
|
||||
skip = sk;
|
||||
return codepoint;
|
||||
}
|
||||
|
||||
/// \param g is the underlying architecture (and loadimage)
|
||||
/// \param max is the maximum number of bytes to allow in a decoded string
|
||||
StringManagerUnicode::StringManagerUnicode(Architecture *g,int4 max)
|
||||
: StringManager(max)
|
||||
{
|
||||
glb = g;
|
||||
testBuffer = new uint1[max];
|
||||
}
|
||||
|
||||
/// Destructor: frees the scratch buffer that the constructor allocated with \b new[].
StringManagerUnicode::~StringManagerUnicode(void)

{
  delete [] testBuffer;
}
|
||||
|
||||
/// \brief Retrieve (and cache) the string data stored at the given address
///
/// A previously cached buffer for the address is returned immediately.  Otherwise
/// bytes are pulled through the loader into the scratch buffer, 32 at a time, until a
/// chunk containing a terminator is seen or \b maximumBytes have been read.  The bytes
/// are then validated.  Single byte characters are stored as is; wider characters
/// are translated to UTF8 first.  If the translation is longer than \b maximumBytes
/// it is cut back, always at a character boundary so the stored bytes stay valid UTF8.
/// Failed lookups are not cached.
/// \param addr is the given address
/// \param charType is the character data-type, whose size selects the encoding
/// \return the stored zero-terminated UTF8 buffer, or null if the bytes are not a valid string
const uint1 *StringManagerUnicode::getStringData(const Address &addr,Datatype *charType)

{
  map<Address,const uint1 *>::iterator iter;
  iter = stringMap.find(addr);
  if (iter != stringMap.end())
    return (*iter).second;

  int4 curBufferSize = 0;
  int4 charsize = charType->getSize();
  bool foundTerminator = false;

  do {
    int4 amount = 32;	// Grab 32 bytes of image at a time
    uint4 newBufferSize = curBufferSize + amount;
    if (newBufferSize > maximumBytes) {
      newBufferSize = maximumBytes;		// Never read past the end of the scratch buffer
      amount = newBufferSize - curBufferSize;
      if (amount == 0) break;
    }
    glb->loader->loadFill(testBuffer+curBufferSize,amount,addr + curBufferSize);
    foundTerminator = hasCharTerminator(testBuffer+curBufferSize,amount,charsize);
    curBufferSize = newBufferSize;
  } while (!foundTerminator);

  const uint1 *resBuffer;
  if (charsize == 1) {
    if (!isCharacterConstant(testBuffer,curBufferSize,charsize))
      return (const uint1 *)0;
    resBuffer = mapBuffer(addr,testBuffer,curBufferSize);
  }
  else {
    // We need to translate to UTF8
    ostringstream s;
    if (!writeUnicode(s, testBuffer, curBufferSize, charsize))
      return (const uint1 *)0;
    string resString = s.str();
    int4 newSize = resString.size();
    if (newSize > maximumBytes) {
      newSize = maximumBytes;
      // If the first dropped byte is a continuation byte (10xxxxxx), the cut lands in
      // the middle of a multi-byte sequence: back up until it sits on a boundary.
      while (newSize > 0 && (((uint1)resString[newSize]) & 0xc0) == 0x80)
        newSize -= 1;
    }
    resBuffer = mapBuffer(addr,(const uint1 *)resString.c_str(),newSize);
  }
  return resBuffer;
}
|
||||
|
||||
/// If the string is encoded in UTF8 or ASCII, we get (on average) a bit of check
|
||||
/// per character. For UTF16, the surrogate reserved area gives at least some check.
|
||||
/// \param buf is the byte array to check
|
||||
/// \param size is the size of the buffer in bytes
|
||||
/// \param charsize is the UTF encoding (1=UTF8, 2=UTF16, 4=UTF32)
|
||||
/// \return \b true if the buffer is filled with valid unicode
|
||||
bool StringManagerUnicode::isCharacterConstant(const uint1 *buf,int4 size,int4 charsize) const
|
||||
|
||||
{
|
||||
if (buf == (const uint1 *)0) return false;
|
||||
bool bigend = glb->translate->isBigEndian();
|
||||
int4 i=0;
|
||||
int4 skip = charsize;
|
||||
while(i<size) {
|
||||
int4 codepoint = getCodepoint(buf+i,charsize,bigend,skip);
|
||||
if (codepoint < 0) return false;
|
||||
if (codepoint == 0) break;
|
||||
i += skip;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/// Assume the buffer contains a null terminated unicode encoded string.
|
||||
/// Write the characters out (as UTF8) to the stream.
|
||||
/// \param s is the output stream
|
||||
/// \param buffer is the given byte buffer
|
||||
/// \param size is the number of bytes in the buffer
|
||||
/// \param charsize specifies the encoding (1=UTF8 2=UTF16 4=UTF32)
|
||||
/// \return \b true if the byte array contains valid unicode
|
||||
bool StringManagerUnicode::writeUnicode(ostream &s,uint1 *buffer,int4 size,int4 charsize)
|
||||
|
||||
{
|
||||
bool bigend = glb->translate->isBigEndian();
|
||||
int4 i=0;
|
||||
int4 skip = charsize;
|
||||
while(i<size) {
|
||||
int4 codepoint = getCodepoint(buffer+i,charsize,bigend,skip);
|
||||
if (codepoint < 0) return false;
|
||||
if (codepoint == 0) break; // Terminator
|
||||
writeUtf8(s, codepoint);
|
||||
i += skip;
|
||||
}
|
||||
return true;
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue