GP-3307 Stack strings

This commit is contained in:
caheckman 2024-04-16 22:24:34 +00:00
parent 9b6ba66aa0
commit 5604178194
37 changed files with 1653 additions and 419 deletions

View file

@ -15,6 +15,7 @@
*/
#include "stringmanage.hh"
#include "architecture.hh"
#include "crc32.hh"
namespace ghidra {
@ -24,6 +25,85 @@ ElementId ELEM_BYTES = ElementId("bytes",83);
ElementId ELEM_STRING = ElementId("string",84);
ElementId ELEM_STRINGMANAGE = ElementId("stringmanage",85);
/// Assume the buffer contains a null terminated unicode encoded string.
/// Write the characters out (as UTF8) to the stream.
/// \param s is the output stream
/// \param buffer is the given byte buffer
/// \param size is the number of bytes in the buffer
/// \param charsize specifies the encoding (1=UTF8 2=UTF16 4=UTF32)
/// \param bigend is \b true if (UTF16 and UTF32) are big endian encoded
/// \return \b true if the byte array contains valid unicode
bool StringManager::writeUnicode(ostream &s,const uint1 *buffer,int4 size,int4 charsize,bool bigend)
{
int4 i=0;
int4 count=0;
int4 skip = charsize;
while(i<size) {
int4 codepoint = getCodepoint(buffer+i,charsize,bigend,skip);
if (codepoint < 0) return false;
if (codepoint == 0) break; // Terminator
writeUtf8(s, codepoint);
i += skip;
count += 1;
if (count >= maximumChars)
break;
}
return true;
}
/// \brief Translate and assign raw string data to a StringData object
///
/// The string data is provided as raw bytes. The data is translated to UTF-8 and truncated
/// to the \b maximumChars allowed by the manager. The encoding must be legal unicode as performed
/// by checkCharacters().
/// \param data is the StringData object to populate
/// \param buf is the raw byte array
/// \param size is the number of bytes in the array
/// \param charsize is the size of unicode encoding
/// \param numChars is the number of characters in the encoding as returned by checkCharacters()
/// \param bigend is \b true if UTF-16 and UTF-32 elements are big endian encoded
void StringManager::assignStringData(StringData &data,const uint1 *buf,int4 size,int4 charsize,int4 numChars,bool bigend)
{
if (charsize == 1 && numChars < maximumChars) {
data.byteData.reserve(size);
data.byteData.assign(buf,buf+size);
}
else {
// We need to translate to UTF8 and/or truncate
ostringstream s;
if (!writeUnicode(s, buf, size, charsize, bigend))
return;
string resString = s.str();
int4 newSize = resString.size();
data.byteData.reserve(newSize + 1);
const uint1 *ptr = (const uint1 *)resString.c_str();
data.byteData.assign(ptr,ptr+newSize);
data.byteData[newSize] = 0; // Make sure there is a null terminator
}
data.isTruncated = (numChars >= maximumChars);
}
/// \brief Calculate hash of a specific Address and contents of a byte array
///
/// Calculate a 32-bit CRC of the bytes and XOR into the upper part of the Address offset.
/// \param addr is the specific Address
/// \param buf is a pointer to the array of bytes
/// \param size is the number of bytes in the array
/// \return the 64-bit hash
uint8 StringManager::calcInternalHash(const Address &addr,const uint1 *buf,int4 size)
{
uint4 reg = 0x7b7c66a9;
for(int4 i=0;i<size;++i) {
reg = crc_update(reg, buf[i]);
}
uint8 res = addr.getOffset();
res ^= ((uint8)reg) << 32;
return res;
}
/// \param max is the maximum number of characters to allow before truncating string
StringManager::StringManager(int4 max)
@ -91,6 +171,33 @@ bool StringManager::isString(const Address &addr,Datatype *charType)
return !buffer.empty();
}
/// \brief Associate string data at a code address or other location that doesn't hold string data normally
///
/// The given byte buffer is decoded, and if it represents a legal string, a non-zero hash is returned,
/// constructed from an Address associated with the string and the string data itself. The registered string
/// can be retrieved via the getStringData() method using this hash as a constant Address. If the string is not
/// legal, 0 is returned.
/// \param addr is the address to associate with the string data
/// \param buf is a pointer to the array of raw bytes encoding the string
/// \param size is the number of bytes in the array
/// \param charType is a character data-type indicating the encoding
/// \return a hash associated with the string or 0
uint8 StringManager::registerInternalStringData(const Address &addr,const uint1 *buf,int4 size,Datatype *charType)
{
int4 charsize = charType->getSize();
int4 numChars = checkCharacters(buf, size, charsize, addr.isBigEndian());
if (numChars < 0)
return 0; // Not a legal encoding
uint8 hash = calcInternalHash(addr, buf, size);
Address constAddr = addr.getSpace()->getManager()->getConstant(hash);
StringData &stringData( stringMap[constAddr] );
stringData.byteData.clear();
stringData.isTruncated = false;
assignStringData(stringData, buf, size, charsize, numChars, addr.isBigEndian());
return hash;
}
/// Encode \<stringmanage> element, with \<string> children.
/// \param encoder is the stream encoder
void StringManager::encode(Encoder &encoder) const
@ -204,6 +311,33 @@ inline int4 StringManager::readUtf16(const uint1 *buf,bool bigend)
return codepoint;
}
/// \brief Make sure buffer has valid bounded set of unicode
///
/// Check that the given buffer contains valid unicode.
/// If the string is encoded in UTF8 or ASCII, we get (on average) a bit of check
/// per character. For UTF16, the surrogate reserved area gives at least some check.
/// \param buf is the byte array to check
/// \param size is the size of the buffer in bytes
/// \param charsize is the UTF encoding (1=UTF8, 2=UTF16, 4=UTF32)
/// \param bigend is \b true if the (UTF16 and UTF32) characters are big endian encoded
/// \return the number of characters or -1 if there is an invalid encoding
int4 StringManager::checkCharacters(const uint1 *buf,int4 size,int4 charsize,bool bigend)
{
if (buf == (const uint1 *)0) return -1;
int4 i=0;
int4 count=0;
int4 skip = charsize;
while(i<size) {
int4 codepoint = getCodepoint(buf+i,charsize,bigend,skip);
if (codepoint < 0) return -1;
if (codepoint == 0) break;
count += 1;
i += skip;
}
return count;
}
/// One or more bytes is consumed from the array, and the number of bytes used is passed back.
/// \param buf is a pointer to the bytes in the character array
/// \param charsize is 1 for UTF8, 2 for UTF16, or 4 for UTF32
@ -328,80 +462,12 @@ const vector<uint1> &StringManagerUnicode::getStringData(const Address &addr,Dat
return stringData.byteData; // Return the empty buffer
}
int4 numChars = checkCharacters(testBuffer, curBufferSize, charsize);
int4 numChars = checkCharacters(testBuffer, curBufferSize, charsize, addr.isBigEndian());
if (numChars < 0)
return stringData.byteData; // Return the empty buffer (invalid encoding)
if (charsize == 1 && numChars < maximumChars) {
stringData.byteData.reserve(curBufferSize);
stringData.byteData.assign(testBuffer,testBuffer+curBufferSize);
}
else {
// We need to translate to UTF8 and/or truncate
ostringstream s;
if (!writeUnicode(s, testBuffer, curBufferSize, charsize))
return stringData.byteData; // Return the empty buffer
string resString = s.str();
int4 newSize = resString.size();
stringData.byteData.reserve(newSize + 1);
const uint1 *ptr = (const uint1 *)resString.c_str();
stringData.byteData.assign(ptr,ptr+newSize);
stringData.byteData[newSize] = 0; // Make sure there is a null terminator
}
stringData.isTruncated = (numChars >= maximumChars);
assignStringData(stringData, testBuffer, curBufferSize, charsize, numChars, addr.isBigEndian());
isTrunc = stringData.isTruncated;
return stringData.byteData;
}
/// Check that the given buffer contains valid unicode.
/// If the string is encoded in UTF8 or ASCII, we get (on average) a bit of check
/// per character. For UTF16, the surrogate reserved area gives at least some check.
/// \param buf is the byte array to check
/// \param size is the size of the buffer in bytes
/// \param charsize is the UTF encoding (1=UTF8, 2=UTF16, 4=UTF32)
/// \return the number of characters or -1 if there is an invalid encoding
int4 StringManagerUnicode::checkCharacters(const uint1 *buf,int4 size,int4 charsize) const
{
if (buf == (const uint1 *)0) return -1;
bool bigend = glb->translate->isBigEndian();
int4 i=0;
int4 count=0;
int4 skip = charsize;
while(i<size) {
int4 codepoint = getCodepoint(buf+i,charsize,bigend,skip);
if (codepoint < 0) return -1;
if (codepoint == 0) break;
count += 1;
i += skip;
}
return count;
}
/// Assume the buffer contains a null terminated unicode encoded string.
/// Write the characters out (as UTF8) to the stream.
/// \param s is the output stream
/// \param buffer is the given byte buffer
/// \param size is the number of bytes in the buffer
/// \param charsize specifies the encoding (1=UTF8 2=UTF16 4=UTF32)
/// \return \b true if the byte array contains valid unicode
bool StringManagerUnicode::writeUnicode(ostream &s,uint1 *buffer,int4 size,int4 charsize)
{
bool bigend = glb->translate->isBigEndian();
int4 i=0;
int4 count=0;
int4 skip = charsize;
while(i<size) {
int4 codepoint = getCodepoint(buffer+i,charsize,bigend,skip);
if (codepoint < 0) return false;
if (codepoint == 0) break; // Terminator
writeUtf8(s, codepoint);
i += skip;
count += 1;
if (count >= maximumChars)
break;
}
return true;
}
} // End namespace ghidra