/* ### * IP: GHIDRA * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include "stringmanage.hh" #include "architecture.hh" #include "crc32.hh" namespace ghidra { AttributeId ATTRIB_TRUNC = AttributeId("trunc",69); ElementId ELEM_BYTES = ElementId("bytes",83); ElementId ELEM_STRING = ElementId("string",84); ElementId ELEM_STRINGMANAGE = ElementId("stringmanage",85); /// Assume the buffer contains a null terminated unicode encoded string. /// Write the characters out (as UTF8) to the stream. /// \param s is the output stream /// \param buffer is the given byte buffer /// \param size is the number of bytes in the buffer /// \param charsize specifies the encoding (1=UTF8 2=UTF16 4=UTF32) /// \param bigend is \b true if (UTF16 and UTF32) are big endian encoded /// \return \b true if the byte array contains valid unicode bool StringManager::writeUnicode(ostream &s,const uint1 *buffer,int4 size,int4 charsize,bool bigend) { int4 i=0; int4 count=0; int4 skip = charsize; while(i= maximumChars) break; } return true; } /// \brief Translate and assign raw string data to a StringData object /// /// The string data is provided as raw bytes. The data is translated to UTF-8 and truncated /// to the \b maximumChars allowed by the manager. The encoding must be legal unicode as performed /// by checkCharacters(). /// \param data is the StringData object to populate /// \param buf is the raw byte array /// \param size is the number of bytes in the array /// \param charsize is the size of unicode encoding /// \param numChars is the number of characters in the encoding as returned by checkCharacters() /// \param bigend is \b true if UTF-16 and UTF-32 elements are big endian encoded void StringManager::assignStringData(StringData &data,const uint1 *buf,int4 size,int4 charsize,int4 numChars,bool bigend) { if (charsize == 1 && numChars < maximumChars) { data.byteData.reserve(size); data.byteData.assign(buf,buf+size); } else { // We need to translate to UTF8 and/or truncate ostringstream s; if (!writeUnicode(s, buf, size, charsize, bigend)) return; string resString = s.str(); int4 newSize = resString.size(); data.byteData.reserve(newSize + 1); const uint1 *ptr = (const uint1 *)resString.c_str(); data.byteData.assign(ptr,ptr+newSize); data.byteData[newSize] = 0; // Make sure there is a null terminator } data.isTruncated = (numChars >= maximumChars); } /// \brief Calculate hash of a specific Address and contents of a byte array /// /// Calculate a 32-bit CRC of the bytes and XOR into the upper part of the Address offset. /// \param addr is the specific Address /// \param buf is a pointer to the array of bytes /// \param size is the number of bytes in the array /// \return the 64-bit hash uint8 StringManager::calcInternalHash(const Address &addr,const uint1 *buf,int4 size) { uint4 reg = 0x7b7c66a9; for(int4 i=0;i 21) throw LowlevelError("Bad unicode codepoint"); if (bits < 12) { // Encode with two bytes bytes[0] = 0xc0 ^ ((codepoint >> 6)&0x1f); bytes[1] = 0x80 ^ (codepoint & 0x3f); size = 2; } else if (bits < 17) { bytes[0] = 0xe0 ^ ((codepoint >> 12)&0xf); bytes[1] = 0x80 ^ ((codepoint >> 6)&0x3f); bytes[2] = 0x80 ^ (codepoint & 0x3f); size = 3; } else { bytes[0] = 0xf0 ^ ((codepoint >> 18) & 7); bytes[1] = 0x80 ^ ((codepoint >> 12) & 0x3f); bytes[2] = 0x80 ^ ((codepoint >> 6) & 0x3f); bytes[3] = 0x80 ^ (codepoint & 0x3f); size = 4; } s.write((char *)bytes, size); } /// Returns \b true if the data is some kind of complete string. /// A given character data-type can be used as a hint for the encoding. /// The string decoding can be cached internally. /// \param addr is the given address /// \param charType is the given character data-type /// \return \b true if the address represents string data bool StringManager::isString(const Address &addr,Datatype *charType) { bool isTrunc; // unused here const vector &buffer(getStringData(addr,charType,isTrunc)); return !buffer.empty(); } /// \brief Associate string data at a code address or other location that doesn't hold string data normally /// /// The given byte buffer is decoded, and if it represents a legal string, a non-zero hash is returned, /// constructed from an Address associated with the string and the string data itself. The registered string /// can be retrieved via the getStringData() method using this hash as a constant Address. If the string is not /// legal, 0 is returned. /// \param addr is the address to associate with the string data /// \param buf is a pointer to the array of raw bytes encoding the string /// \param size is the number of bytes in the array /// \param charType is a character data-type indicating the encoding /// \return a hash associated with the string or 0 uint8 StringManager::registerInternalStringData(const Address &addr,const uint1 *buf,int4 size,Datatype *charType) { int4 charsize = charType->getSize(); int4 numChars = checkCharacters(buf, size, charsize, addr.isBigEndian()); if (numChars < 0) return 0; // Not a legal encoding uint8 hash = calcInternalHash(addr, buf, size); Address constAddr = addr.getSpace()->getManager()->getConstant(hash); StringData &stringData( stringMap[constAddr] ); stringData.byteData.clear(); stringData.isTruncated = false; assignStringData(stringData, buf, size, charsize, numChars, addr.isBigEndian()); return hash; } /// Encode \ element, with \ children. /// \param encoder is the stream encoder void StringManager::encode(Encoder &encoder) const { encoder.openElement(ELEM_STRINGMANAGE); map::const_iterator iter1; for(iter1=stringMap.begin();iter1!=stringMap.end();++iter1) { encoder.openElement(ELEM_STRING); (*iter1).first.encode(encoder); const StringData &stringData( (*iter1).second ); encoder.openElement(ELEM_BYTES); encoder.writeBool(ATTRIB_TRUNC, stringData.isTruncated); ostringstream s; s << '\n' << setfill('0'); for(int4 i=0;i element, with \ children. /// \param decoder is the stream decoder void StringManager::decode(Decoder &decoder) { uint4 elemId = decoder.openElement(ELEM_STRINGMANAGE); for (;;) { uint4 subId = decoder.openElement(); if (subId != ELEM_STRING) break; Address addr = Address::decode(decoder); StringData &stringData(stringMap[addr]); uint4 subId2 = decoder.openElement(ELEM_BYTES); stringData.isTruncated = decoder.readBool(ATTRIB_TRUNC); istringstream is(decoder.readString(ATTRIB_CONTENT)); int4 val; char c1, c2; is >> ws; c1 = is.get(); c2 = is.get(); while ((c1 > 0) && (c2 > 0)) { if (c1 <= '9') c1 = c1 - '0'; else if (c1 <= 'F') c1 = c1 + 10 - 'A'; else c1 = c1 + 10 - 'a'; if (c2 <= '9') c2 = c2 - '0'; else if (c2 <= 'F') c2 = c2 + 10 - 'A'; else c2 = c2 + 10 - 'a'; val = c1 * 16 + c2; stringData.byteData.push_back((uint1) val); is >> ws; c1 = is.get(); c2 = is.get(); } decoder.closeElement(subId2); decoder.closeElement(subId); } decoder.closeElement(elemId); } /// \param buffer is the byte buffer /// \param size is the number of bytes in the buffer /// \param charsize is the presumed size (in bytes) of character elements /// \return \b true if a string terminator is found bool StringManager::hasCharTerminator(const uint1 *buffer,int4 size,int4 charsize) { for(int4 i=0;i=0xD800)&&(codepoint<=0xDBFF)) { // high surrogate int4 trail=readUtf16(buf+2,bigend); sk += 2; if ((trail<0xDC00)||(trail>0xDFFF)) return -1; // Bad trail codepoint = (codepoint<<10) + trail + (0x10000 - (0xD800 << 10) - 0xDC00); } else if ((codepoint>=0xDC00)&&(codepoint<=0xDFFF)) return -1; // trail before high } else if (charsize==1) { // UTF-8 int4 val = buf[0]; if ((val&0x80)==0) { codepoint = val; sk = 1; } else if ((val&0xe0)==0xc0) { int4 val2 = buf[1]; sk = 2; if ((val2&0xc0)!=0x80) return -1; // Not a valid UTF8-encoding codepoint = ((val&0x1f)<<6) | (val2 & 0x3f); } else if ((val&0xf0)==0xe0) { int4 val2 = buf[1]; int4 val3 = buf[2]; sk = 3; if (((val2&0xc0)!=0x80)||((val3&0xc0)!=0x80)) return -1; // invalid encoding codepoint = ((val&0xf)<<12) | ((val2&0x3f)<<6) | (val3 & 0x3f); } else if ((val&0xf8)==0xf0) { int4 val2 = buf[1]; int4 val3 = buf[2]; int4 val4 = buf[3]; sk = 4; if (((val2&0xc0)!=0x80)||((val3&0xc0)!=0x80)||((val4&0xc0)!=0x80)) return -1; // invalid encoding codepoint = ((val&7)<<18) | ((val2&0x3f)<<12) | ((val3&0x3f)<<6) | (val4 & 0x3f); } else return -1; } else if (charsize == 4) { // UTF-32 sk = 4; if (bigend) codepoint = (buf[0]<<24) + (buf[1]<<16) + (buf[2]<<8) + buf[3]; else codepoint = (buf[3]<<24) + (buf[2]<<16) + (buf[1]<<8) + buf[0]; } else return -1; if (codepoint >= 0xd800) { if (codepoint > 0x10ffff) // Bigger than maximum codepoint return -1; if (codepoint <= 0xdfff) return -1; // Reserved for surrogates, invalid codepoints } skip = sk; return codepoint; } /// \param g is the underlying architecture (and loadimage) /// \param max is the maximum number of bytes to allow in a decoded string StringManagerUnicode::StringManagerUnicode(Architecture *g,int4 max) : StringManager(max) { glb = g; testBuffer = new uint1[max]; } StringManagerUnicode::~StringManagerUnicode(void) { delete [] testBuffer; } const vector &StringManagerUnicode::getStringData(const Address &addr,Datatype *charType,bool &isTrunc) { map::iterator iter; iter = stringMap.find(addr); if (iter != stringMap.end()) { isTrunc = (*iter).second.isTruncated; return (*iter).second.byteData; } StringData &stringData(stringMap[addr]); // Allocate (initially empty) byte vector stringData.isTruncated = false; isTrunc = false; if (charType->isOpaqueString()) // Cannot currently test for an opaque encoding return stringData.byteData; // Return the empty buffer int4 curBufferSize = 0; int4 charsize = charType->getSize(); bool foundTerminator = false; try { do { int4 amount = 32; // Grab 32 bytes of image at a time uint4 newBufferSize = curBufferSize + amount; if (newBufferSize > maximumChars) { newBufferSize = maximumChars; amount = newBufferSize - curBufferSize; if (amount == 0) { return stringData.byteData; // Could not find terminator } } glb->loader->loadFill(testBuffer + curBufferSize, amount, addr + curBufferSize); foundTerminator = hasCharTerminator(testBuffer + curBufferSize, amount, charsize); curBufferSize = newBufferSize; } while (!foundTerminator); } catch (DataUnavailError &err) { return stringData.byteData; // Return the empty buffer } int4 numChars = checkCharacters(testBuffer, curBufferSize, charsize, addr.isBigEndian()); if (numChars < 0) return stringData.byteData; // Return the empty buffer (invalid encoding) assignStringData(stringData, testBuffer, curBufferSize, charsize, numChars, addr.isBigEndian()); isTrunc = stringData.isTruncated; return stringData.byteData; } } // End namespace ghidra