GP-3307 Stack strings

2025-10-04 18:29:37 +02:00 · 2024-04-16 22:24:34 +00:00 · 2024-04-16 22:24:34 +00:00 · 5604178194
commit 5604178194
parent 9b6ba66aa0
37 changed files with 1653 additions and 419 deletions
--- a/Ghidra/Features/Decompiler/src/decompile/cpp/stringmanage.cc
+++ b/Ghidra/Features/Decompiler/src/decompile/cpp/stringmanage.cc
@ -15,6 +15,7 @@
 */
 #include "stringmanage.hh"
 #include "architecture.hh"
+#include "crc32.hh"

 namespace ghidra {

@ -24,6 +25,85 @@ ElementId ELEM_BYTES = ElementId("bytes",83);
 ElementId ELEM_STRING = ElementId("string",84);
 ElementId ELEM_STRINGMANAGE = ElementId("stringmanage",85);

+/// Assume the buffer contains a null terminated unicode encoded string.
+/// Write the characters out (as UTF8) to the stream.
+/// \param s is the output stream
+/// \param buffer is the given byte buffer
+/// \param size is the number of bytes in the buffer
+/// \param charsize specifies the encoding (1=UTF8 2=UTF16 4=UTF32)
+/// \param bigend is \b true if (UTF16 and UTF32) are big endian encoded
+/// \return \b true if the byte array contains valid unicode
+bool StringManager::writeUnicode(ostream &s,const uint1 *buffer,int4 size,int4 charsize,bool bigend)
+
+{
+  int4 i=0;
+  int4 count=0;
+  int4 skip = charsize;
+  while(i<size) {
+    int4 codepoint = getCodepoint(buffer+i,charsize,bigend,skip);
+    if (codepoint < 0) return false;
+    if (codepoint == 0) break;		// Terminator
+    writeUtf8(s, codepoint);
+    i += skip;
+    count += 1;
+    if (count >= maximumChars)
+      break;
+  }
+  return true;
+}
+
+/// \brief Translate and assign raw string data to a StringData object
+///
+/// The string data is provided as raw bytes.  The data is translated to UTF-8 and truncated
+/// to the \b maximumChars allowed by the manager.  The encoding must be legal unicode as performed
+/// by checkCharacters().
+/// \param data is the StringData object to populate
+/// \param buf is the raw byte array
+/// \param size is the number of bytes in the array
+/// \param charsize is the size of unicode encoding
+/// \param numChars is the number of characters in the encoding as returned by checkCharacters()
+/// \param bigend is \b true if UTF-16 and UTF-32 elements are big endian encoded
+void StringManager::assignStringData(StringData &data,const uint1 *buf,int4 size,int4 charsize,int4 numChars,bool bigend)
+
+{
+  if (charsize == 1 && numChars < maximumChars) {
+    data.byteData.reserve(size);
+    data.byteData.assign(buf,buf+size);
+  }
+  else {
+    // We need to translate to UTF8 and/or truncate
+    ostringstream s;
+    if (!writeUnicode(s, buf, size, charsize, bigend))
+      return;
+    string resString = s.str();
+    int4 newSize = resString.size();
+    data.byteData.reserve(newSize + 1);
+    const uint1 *ptr = (const uint1 *)resString.c_str();
+    data.byteData.assign(ptr,ptr+newSize);
+    data.byteData[newSize] = 0;		// Make sure there is a null terminator
+  }
+  data.isTruncated = (numChars >= maximumChars);
+}
+
+/// \brief Calculate hash of a specific Address and contents of a byte array
+///
+/// Calculate a 32-bit CRC of the bytes and XOR into the upper part of the Address offset.
+/// \param addr is the specific Address
+/// \param buf is a pointer to the array of bytes
+/// \param size is the number of bytes in the array
+/// \return the 64-bit hash
+uint8 StringManager::calcInternalHash(const Address &addr,const uint1 *buf,int4 size)
+
+{
+  uint4 reg = 0x7b7c66a9;
+  for(int4 i=0;i<size;++i) {
+    reg = crc_update(reg, buf[i]);
+  }
+  uint8 res = addr.getOffset();
+  res ^= ((uint8)reg) << 32;
+  return res;
+}
+
 /// \param max is the maximum number of characters to allow before truncating string
 StringManager::StringManager(int4 max)

@ -91,6 +171,33 @@ bool StringManager::isString(const Address &addr,Datatype *charType)
  return !buffer.empty();
 }

+/// \brief Associate string data at a code address or other location that doesn't hold string data normally
+///
+/// The given byte buffer is decoded, and if it represents a legal string, a non-zero hash is returned,
+/// constructed from an Address associated with the string and the string data itself. The registered string
+/// can be retrieved via the getStringData() method using this hash as a constant Address.  If the string is not
+/// legal, 0 is returned.
+/// \param addr is the address to associate with the string data
+/// \param buf is a pointer to the array of raw bytes encoding the string
+/// \param size is the number of bytes in the array
+/// \param charType is a character data-type indicating the encoding
+/// \return a hash associated with the string or 0
+uint8 StringManager::registerInternalStringData(const Address &addr,const uint1 *buf,int4 size,Datatype *charType)
+
+{
+  int4 charsize = charType->getSize();
+  int4 numChars = checkCharacters(buf, size, charsize, addr.isBigEndian());
+  if (numChars < 0)
+    return 0;	// Not a legal encoding
+  uint8 hash = calcInternalHash(addr, buf, size);
+  Address constAddr = addr.getSpace()->getManager()->getConstant(hash);
+  StringData &stringData( stringMap[constAddr] );
+  stringData.byteData.clear();
+  stringData.isTruncated = false;
+  assignStringData(stringData, buf, size, charsize, numChars, addr.isBigEndian());
+  return hash;
+}
+
 /// Encode \<stringmanage> element, with \<string> children.
 /// \param encoder is the stream encoder
 void StringManager::encode(Encoder &encoder) const
@ -204,6 +311,33 @@ inline int4 StringManager::readUtf16(const uint1 *buf,bool bigend)
  return codepoint;
 }

+/// \brief Make sure buffer has valid bounded set of unicode
+///
+/// Check that the given buffer contains valid unicode.
+/// If the string is encoded in UTF8 or ASCII, we get (on average) a bit of check
+/// per character.  For UTF16, the surrogate reserved area gives at least some check.
+/// \param buf is the byte array to check
+/// \param size is the size of the buffer in bytes
+/// \param charsize is the UTF encoding (1=UTF8, 2=UTF16, 4=UTF32)
+/// \param bigend is \b true if the (UTF16 and UTF32) characters are big endian encoded
+/// \return the number of characters or -1 if there is an invalid encoding
+int4 StringManager::checkCharacters(const uint1 *buf,int4 size,int4 charsize,bool bigend)
+
+{
+  if (buf == (const uint1 *)0) return -1;
+  int4 i=0;
+  int4 count=0;
+  int4 skip = charsize;
+  while(i<size) {
+    int4 codepoint = getCodepoint(buf+i,charsize,bigend,skip);
+    if (codepoint < 0) return -1;
+    if (codepoint == 0) break;
+    count += 1;
+    i += skip;
+  }
+  return count;
+}
+
 /// One or more bytes is consumed from the array, and the number of bytes used is passed back.
 /// \param buf is a pointer to the bytes in the character array
 /// \param charsize is 1 for UTF8, 2 for UTF16, or 4 for UTF32
@ -328,80 +462,12 @@ const vector<uint1> &StringManagerUnicode::getStringData(const Address &addr,Dat
    return stringData.byteData;			// Return the empty buffer
  }

-  int4 numChars = checkCharacters(testBuffer, curBufferSize, charsize);
+  int4 numChars = checkCharacters(testBuffer, curBufferSize, charsize, addr.isBigEndian());
  if (numChars < 0)
    return stringData.byteData;		// Return the empty buffer (invalid encoding)
-  if (charsize == 1 && numChars < maximumChars) {
-    stringData.byteData.reserve(curBufferSize);
-    stringData.byteData.assign(testBuffer,testBuffer+curBufferSize);
-  }
-  else {
-    // We need to translate to UTF8 and/or truncate
-    ostringstream s;
-    if (!writeUnicode(s, testBuffer, curBufferSize, charsize))
-      return stringData.byteData;		// Return the empty buffer
-    string resString = s.str();
-    int4 newSize = resString.size();
-    stringData.byteData.reserve(newSize + 1);
-    const uint1 *ptr = (const uint1 *)resString.c_str();
-    stringData.byteData.assign(ptr,ptr+newSize);
-    stringData.byteData[newSize] = 0;		// Make sure there is a null terminator
-  }
-  stringData.isTruncated = (numChars >= maximumChars);
+  assignStringData(stringData, testBuffer, curBufferSize, charsize, numChars, addr.isBigEndian());
  isTrunc = stringData.isTruncated;
  return stringData.byteData;
 }

-/// Check that the given buffer contains valid unicode.
-/// If the string is encoded in UTF8 or ASCII, we get (on average) a bit of check
-/// per character.  For UTF16, the surrogate reserved area gives at least some check.
-/// \param buf is the byte array to check
-/// \param size is the size of the buffer in bytes
-/// \param charsize is the UTF encoding (1=UTF8, 2=UTF16, 4=UTF32)
-/// \return the number of characters or -1 if there is an invalid encoding
-int4 StringManagerUnicode::checkCharacters(const uint1 *buf,int4 size,int4 charsize) const
-
-{
-  if (buf == (const uint1 *)0) return -1;
-  bool bigend = glb->translate->isBigEndian();
-  int4 i=0;
-  int4 count=0;
-  int4 skip = charsize;
-  while(i<size) {
-    int4 codepoint = getCodepoint(buf+i,charsize,bigend,skip);
-    if (codepoint < 0) return -1;
-    if (codepoint == 0) break;
-    count += 1;
-    i += skip;
-  }
-  return count;
-}
-
-/// Assume the buffer contains a null terminated unicode encoded string.
-/// Write the characters out (as UTF8) to the stream.
-/// \param s is the output stream
-/// \param buffer is the given byte buffer
-/// \param size is the number of bytes in the buffer
-/// \param charsize specifies the encoding (1=UTF8 2=UTF16 4=UTF32)
-/// \return \b true if the byte array contains valid unicode
-bool StringManagerUnicode::writeUnicode(ostream &s,uint1 *buffer,int4 size,int4 charsize)
-
-{
-  bool bigend = glb->translate->isBigEndian();
-  int4 i=0;
-  int4 count=0;
-  int4 skip = charsize;
-  while(i<size) {
-    int4 codepoint = getCodepoint(buffer+i,charsize,bigend,skip);
-    if (codepoint < 0) return false;
-    if (codepoint == 0) break;		// Terminator
-    writeUtf8(s, codepoint);
-    i += skip;
-    count += 1;
-    if (count >= maximumChars)
-      break;
-  }
-  return true;
-}
-
 } // End namespace ghidra