GT-3347 - code cleanup, code review

Unify char representation and string representation code paths. Add CHAR_SEQ string layout enum. Misc javadocs.
2025-10-05 02:39:44 +02:00 · 2019-12-05 12:29:24 -05:00 · 2019-12-05 12:29:24 -05:00 · 3faebfebe8
commit 3faebfebe8
parent 603ca28c01
14 changed files with 168 additions and 222 deletions
--- a/Ghidra/Features/Base/src/main/java/ghidra/app/plugin/core/codebrowser/hover/DataTypeListingHover.java
+++ b/Ghidra/Features/Base/src/main/java/ghidra/app/plugin/core/codebrowser/hover/DataTypeListingHover.java
@ -146,7 +146,7 @@ public class DataTypeListingHover extends AbstractDataTypeHover implements Listi
 				result += String.format("<br>Translated value: %s",
 					HTMLUtilities.friendlyEncodeHTML(sdi.getTranslatedValue()));
 			}
-			if (!sdi.isPascal() && !sdi.hasNullTerminator()) {
+			if (sdi.isMissingNullTerminator()) {
 				result += "<br>Missing NULL terminator.";
 			}
 			if (sdi.getStringLength() > dataInstance.getLength()) {
--- a/Ghidra/Features/Base/src/main/java/ghidra/app/plugin/core/equate/ConvertToCharAction.java
+++ b/Ghidra/Features/Base/src/main/java/ghidra/app/plugin/core/equate/ConvertToCharAction.java
@ -49,6 +49,7 @@ public class ConvertToCharAction extends AbstractConvertAction {
 	protected String convertToString(Program program, Scalar scalar, boolean isData) {
 		byte[] bytes = scalar.byteArrayValue();

-		return StringDataInstance.getCharRepresentation(ByteDataType.dataType, bytes, null);
+		return StringDataInstance.getCharRepresentation(ByteDataType.dataType, bytes, null,
+			program.getMemory().isBigEndian());
 	}
 }
--- a/Ghidra/Features/Base/src/main/java/ghidra/app/plugin/core/hover/AbstractScalarOperandHover.java
+++ b/Ghidra/Features/Base/src/main/java/ghidra/app/plugin/core/hover/AbstractScalarOperandHover.java
@ -221,7 +221,7 @@ public abstract class AbstractScalarOperandHover extends AbstractConfigurableHov

 	private boolean hasEncodingError(String s) {
 		return s.codePoints().anyMatch(
-			codePoint -> StringUtilities.isUnicodeReplacementCodePoint(codePoint));
+			codePoint -> codePoint == StringUtilities.UNICODE_REPLACEMENT);
 	}

 	private ByteMemBufferImpl getScalarOperandAsMemBuffer(Address addr, Scalar scalar,
--- a/Ghidra/Features/Base/src/main/java/ghidra/app/plugin/core/strings/HasEncodingErrorColumnConstraint.java
+++ b/Ghidra/Features/Base/src/main/java/ghidra/app/plugin/core/strings/HasEncodingErrorColumnConstraint.java
@ -32,7 +32,7 @@ public class HasEncodingErrorColumnConstraint extends StringDataInstanceColumnCo
 	public boolean accepts(StringDataInstance value, TableFilterContext context) {
 		String s = value.getStringValue();
 		return (s != null) && s.chars().anyMatch(
-			codePoint -> StringUtilities.isUnicodeReplacementCodePoint(codePoint));
+			codePoint -> codePoint == StringUtilities.UNICODE_REPLACEMENT);
 	}

 	@Override
--- a/Ghidra/Features/Base/src/main/java/ghidra/app/plugin/core/strings/ViewStringsTableModel.java
+++ b/Ghidra/Features/Base/src/main/java/ghidra/app/plugin/core/strings/ViewStringsTableModel.java
@ -361,7 +361,7 @@ class ViewStringsTableModel extends AddressBasedTableModel<ProgramLocation> {
 			String s = StringDataInstance.getStringDataInstance(data).getStringValue();

 			return (s != null) && s.chars().anyMatch(
-				codePoint -> StringUtilities.isUnicodeReplacementCodePoint(codePoint));
+				codePoint -> codePoint == StringUtilities.UNICODE_REPLACEMENT);
 		}

 		@Override
--- a/Ghidra/Framework/Generic/src/main/java/ghidra/util/StringUtilities.java
+++ b/Ghidra/Framework/Generic/src/main/java/ghidra/util/StringUtilities.java
@ -1063,16 +1063,4 @@ public class StringUtilities {
 		}
 		return new String(new int[] { codePoint }, 0, 1);
 	}
-
-	/**
-	 * Returns true if the specified code point is the 'replacement' code point 0xFFFD,
-	 * which is used when decoding bytes into unicode chars and there was a bad or invalid
-	 * sequence that does not have a mapping. (ie. decoding byte char 0x80 as US-ASCII)
-	 *
-	 * @param codePoint to test
-	 * @return boolean true if the char is 0xFFFD (ie. UNICODE REPLACEMENT char)
-	 */
-	public static boolean isUnicodeReplacementCodePoint(int codePoint) {
-		return codePoint == UNICODE_REPLACEMENT;
-	}
 }
--- a/Ghidra/Framework/SoftwareModeling/src/main/java/ghidra/program/model/data/AbstractIntegerDataType.java
+++ b/Ghidra/Framework/SoftwareModeling/src/main/java/ghidra/program/model/data/AbstractIntegerDataType.java
@ -231,7 +231,7 @@ public abstract class AbstractIntegerDataType extends BuiltIn implements ArraySt
 		}

 		if (getFormatSettingsDefinition().getFormat(settings) == FormatSettingsDefinition.CHAR) {
-			return StringDataInstance.getCharRepresentation(this, bytes, settings);
+			return StringDataInstance.getCharRepresentation(this, bytes, settings, !isLE);
 		}

 		return getRepresentation(new BigInteger(bytes), settings, 8 * length);
--- a/Ghidra/Framework/SoftwareModeling/src/main/java/ghidra/program/model/data/AbstractStringDataType.java
+++ b/Ghidra/Framework/SoftwareModeling/src/main/java/ghidra/program/model/data/AbstractStringDataType.java
@ -15,9 +15,9 @@
 */
 package ghidra.program.model.data;

-import static ghidra.program.model.data.CharsetSettingsDefinition.CHARSET;
-import static ghidra.program.model.data.RenderUnicodeSettingsDefinition.RENDER;
-import static ghidra.program.model.data.TranslationSettingsDefinition.TRANSLATION;
+import static ghidra.program.model.data.CharsetSettingsDefinition.*;
+import static ghidra.program.model.data.RenderUnicodeSettingsDefinition.*;
+import static ghidra.program.model.data.TranslationSettingsDefinition.*;

 import ghidra.docking.settings.*;
 import ghidra.program.model.mem.MemBuffer;
@ -28,7 +28,7 @@ import ghidra.program.model.mem.MemBuffer;
 * See {@link StringDataType} for information about string variations and configuration details.
 * <p>
 * Sub-classes generally only need to implement a constructor that calls the mega-constructor
- * {@link #AbstractStringDataType(String, String, String, String, String, String, String, DataType, StringLayoutEnum, DataTypeManager) AbstractStringDataType.SAbstractStringDataType(lots,of,params)}
+ * {@link #AbstractStringDataType(String, String, String, String, String, String, String, DataType, StringLayoutEnum, DataTypeManager) AbstractStringDataType.AbstractStringDataType(lots,of,params)}
 * and the {@link DataType#clone(DataTypeManager) } method.
 * <p>
 *
--- a/Ghidra/Framework/SoftwareModeling/src/main/java/ghidra/program/model/data/BitFieldDataType.java
+++ b/Ghidra/Framework/SoftwareModeling/src/main/java/ghidra/program/model/data/BitFieldDataType.java
@ -410,14 +410,17 @@ public class BitFieldDataType extends AbstractDataType {
 		AbstractIntegerDataType intDT = (AbstractIntegerDataType) dt;
 		if (intDT.getFormatSettingsDefinition().getFormat(
 			settings) == FormatSettingsDefinition.CHAR) {
+			if (big.signum() < 0) {
+				big = big.add(BigInteger.valueOf(2).pow(bitSize));
+			}
 			int bytesLen = BitFieldDataType.getMinimumStorageSize(bitSize);
-			byte[] bytes = DataConverter.getInstance(getDataOrganization().isBigEndian()).getBytes(
-				big, bytesLen);
+			byte[] bytes = DataConverter.getInstance(buf.isBigEndian()).getBytes(big, bytesLen);
 			if (!EndianSettingsDefinition.ENDIAN.isBigEndian(settings, buf)) {
 				bytes = ArrayUtilities.reverse(bytes);
 			}

-			return StringDataInstance.getCharRepresentation(this, bytes, settings);
+			return StringDataInstance.getCharRepresentation(this, bytes, settings,
+				buf.isBigEndian());
 		}

 		return intDT.getRepresentation(big, settings, effectiveBitSize);
--- a/Ghidra/Framework/SoftwareModeling/src/main/java/ghidra/program/model/data/StringDataInstance.java
+++ b/Ghidra/Framework/SoftwareModeling/src/main/java/ghidra/program/model/data/StringDataInstance.java
@ -82,14 +82,15 @@ public class StringDataInstance {
 	 * @param dataType the {@link DataType} of the element containing the bytes (most likely a ByteDataType)
 	 * @param bytes the bytes to convert
 	 * @param settings the {@link Settings} object for the location where the bytes came from, or null
+	 * @param isBigEndian boolean flag indicating data is big endian
 	 * @return formatted string (typically with quotes around the contents): single character: 'a', multiple characters: "a\x12bc"
 	 */
-	public static String getCharRepresentation(DataType dataType, byte[] bytes, Settings settings) {
-		MemBuffer memBuf = new ByteMemBufferImpl(null, bytes, true);
-		StringDataInstance instance =
-			new StringDataInstance(dataType, settings, memBuf, bytes.length, RENDER_ENUM.ESC_SEQ);
-		return bytes.length == 1 ? instance.getCharRepresentation()
-				: instance.getCharSequenceRepresentation();
+	public static String getCharRepresentation(DataType dataType, byte[] bytes, Settings settings,
+			boolean isBigEndian) {
+		MemBuffer memBuf = new ByteMemBufferImpl(null, bytes, isBigEndian);
+		StringDataInstance sdi =
+			new StringDataInstance(dataType, settings, memBuf, bytes.length);
+		return sdi.getCharRepresentation();
 	}

 	/**
@ -223,43 +224,12 @@ public class StringDataInstance {
 		this.length = length;
 	}

-	/**
-	 * Creates a string instance using the data in the {@link MemBuffer} and the settings
-	 * pulled from the {@link AbstractStringDataType string data type} but using the given
-	 * {@link RenderUnicodeSettingsDefinition.RENDER_ENUM rendering setting}.
-	 *
-	 * @param dataType {@link AbstractStringDataType} common string base data type.
-	 * @param settings {@link Settings} attached to the data location.
-	 * @param buf {@link MemBuffer} containing the data.
-	 * @param length Length passed from the caller to the datatype.  -1 indicates a 'probe'
-	 * trying to detect the length of an unknown string, otherwise it will be the length
-	 * of the containing field of the data instance.
-	 * @param renderSettings How to render the instance contents.
-	 */
-	public StringDataInstance(DataType dataType, Settings settings, MemBuffer buf, int length,
-			RenderUnicodeSettingsDefinition.RENDER_ENUM renderSettings) {
-		settings = (settings == null) ? SettingsImpl.NO_SETTINGS : settings;
-		this.buf = buf;
-		this.charsetName = getCharsetNameFromDataTypeOrSettings(dataType, settings);
-		this.charSize = CharsetInfo.getInstance().getCharsetCharSize(charsetName);
-		// NOTE: for now only handle padding for charSize == 1
-		this.paddedCharSize =
-			charSize == 1 ? getDataOrganization(dataType).getCharSize() : charSize;
-		this.stringLayout = getLayoutFromDataType(dataType);
-		this.showTranslation = TRANSLATION.isShowTranslated(settings);
-		this.translatedValue = TRANSLATION.getTranslatedValue(settings);
-		this.renderSetting = renderSettings;
-		this.endianSetting = ENDIAN.getEndianess(settings, null);
-
-		this.length = length;
-	}
-
 	private StringDataInstance(StringDataInstance copyFrom, StringLayoutEnum newLayout,
-			MemBuffer newBuf, int newLen) {
+			MemBuffer newBuf, int newLen, String newCharsetName) {
 		this.charSize = copyFrom.charSize;
 		this.paddedCharSize = copyFrom.paddedCharSize;
 		this.translatedValue = null;
-		this.charsetName = copyFrom.charsetName;
+		this.charsetName = newCharsetName;
 		this.stringLayout = newLayout;
 		this.showTranslation = false;
 		this.renderSetting = copyFrom.renderSetting;
@ -284,13 +254,16 @@ public class StringDataInstance {
 			return ((AbstractStringDataType) dataType).getStringLayout();
 		}
 		if (dataType instanceof AbstractIntegerDataType || dataType instanceof BitFieldDataType) {
-			return StringLayoutEnum.FIXED_LEN;
+			return StringLayoutEnum.CHAR_SEQ;
 		}
 		return StringLayoutEnum.NULL_TERMINATED_BOUNDED;
 	}

 	private static String getCharsetNameFromDataTypeOrSettings(DataType dataType,
 			Settings settings) {
+		if (dataType instanceof BitFieldDataType) {
+			dataType = ((BitFieldDataType) dataType).getBaseDataType();
+		}
 		return (dataType instanceof DataTypeWithCharset)
 				? ((DataTypeWithCharset) dataType).getCharsetName(settings)
 				: DEFAULT_CHARSET_NAME;
@ -324,7 +297,7 @@ public class StringDataInstance {
 	}

 	private boolean isAlreadyDeterminedFixedLen() {
-		return length >= 0 && (stringLayout == StringLayoutEnum.FIXED_LEN);
+		return length >= 0 && stringLayout.isFixedLen();
 	}

 	public boolean isPascal() {
@ -409,16 +382,16 @@ public class StringDataInstance {
 	}

 	/**
-	 * Returns true if the string has a trailing NULL character within the data instance's
-	 * bounds.
+	 * Returns true if the string should have a trailing NULL character and doesn't.
 	 *
-	 * @return boolean true if there is a trailing NULL character.
+	 * @return boolean true if the trailing NULL character is missing, false if string type
+	 * doesn't need a trailing NULL character or if it is present.
 	 */
-	public boolean hasNullTerminator() {
+	public boolean isMissingNullTerminator() {

-		if (!isPascal()) {
+		if (stringLayout.shouldTrimTrailingNulls()) {
 			String str = getStringValueNoTrim();
-			return (str != null) && (str.length() > 0) && str.charAt(str.length() - 1) == 0;
+			return (str != null) && (str.length() > 0) && str.charAt(str.length() - 1) != 0;
 		}
 		return false;
 	}
@ -474,7 +447,7 @@ public class StringDataInstance {
 	public String getStringValue() {
 		String str = getStringValueNoTrim();

-		return (str != null) && !isPascal() ? trimNulls(str) : str;
+		return (str != null) && stringLayout.shouldTrimTrailingNulls() ? trimNulls(str) : str;
 	}

 	private String getStringValueNoTrim() {
@ -492,7 +465,7 @@ public class StringDataInstance {
 	}

 	private byte[] getStringBytes() {
-		return isPascal() ? getPascalCharBytes() : getNormalStringCharBytes();
+		return stringLayout.isPascal() ? getPascalCharBytes() : getNormalStringCharBytes();
 	}

 	private byte[] getNormalStringCharBytes() {
@ -680,10 +653,10 @@ public class StringDataInstance {
 	 * @return formatted String
 	 */
 	public String getStringRepresentation() {
-		return getStringRep(true);
+		return getStringRep(StringRenderBuilder.DOUBLE_QUOTE, StringRenderBuilder.DOUBLE_QUOTE);
 	}

-	private String getStringRep(boolean trimNulls) {
+	private String getStringRep(char quoteChar, char quoteCharMulti) {

 		if (isProbe() || isBadCharSize() || !buf.isInitializedMemory()) {
 			return UNKNOWN;
@ -703,15 +676,23 @@ public class StringDataInstance {
 			return UNKNOWN_DOT_DOT_DOT;
 		}

+		if (stringValue.length() == 0 && aci.byteStartOffset != 0) {
+			// If the byteStartOffset isn't zero it means there was one char that was the unicode BOM.
+			// Asking the Charset to decode it returned nothing, so force it.
+			stringValue = BOM_RESULT_STR;
+		}
+
 		// if we get the same number of characters out that we put into the decoder,
 		// then its a good chance there is a one-to-one correspondence between original char
 		// and decoded char.
 		boolean canRecoverOriginalCharBytes =
-			(stringValue.length() - aci.byteStartOffset) == (stringBytes.length / charSize);
+			stringValue.length() == ((stringBytes.length - aci.byteStartOffset) / charSize);

-		StringRenderBuilder strBuf = new StringRenderBuilder(charSize);
+		stringValue = stringLayout.shouldTrimTrailingNulls() ? trimNulls(stringValue) : stringValue;
+
+		StringRenderBuilder strBuf = new StringRenderBuilder(charSize,
+			stringValue.length() == 1 ? quoteChar : quoteCharMulti);

-		stringValue = !isPascal() && trimNulls ? trimNulls(stringValue) : stringValue;
 		if (stringValue.isEmpty() || (stringValue.length() == 1 && stringValue.charAt(0) == 0)) {
 			// force the string renderer into "string" mode so we get empty quotes when done.
 			strBuf.addString("");
@ -723,7 +704,8 @@ public class StringDataInstance {
 			byte[] originalCharBytes;
 			if (canRecoverOriginalCharBytes) {
 				originalCharBytes = new byte[charSize];
-				System.arraycopy(stringBytes, i * charSize, originalCharBytes, 0, charSize);
+				System.arraycopy(stringBytes, i * charSize + aci.byteStartOffset, originalCharBytes,
+					0, charSize);
 			}
 			else {
 				// can't get original bytes, cheat and run the codePoint through the charset
@ -732,6 +714,14 @@ public class StringDataInstance {
 				originalCharBytes = convertStringToBytes(singleCharStr, aci);
 			}

+			RENDER_ENUM currentCharRenderSetting = renderSetting;
+			if (codePoint == StringUtilities.UNICODE_REPLACEMENT && canRecoverOriginalCharBytes &&
+				isMismatchedCharBytes(originalCharBytes, codePoint)) {
+				// if this is a true decode error and we can recover the original bytes,
+				// then force the render mode to byte seq.
+				currentCharRenderSetting = RENDER_ENUM.BYTE_SEQ;
+			}
+
 			if (StringUtilities.isControlCharacterOrBackslash(codePoint)) {
 				strBuf.addString(StringUtilities.convertCodePointToEscapeSequence(codePoint));
 			}
@ -741,36 +731,24 @@ public class StringDataInstance {
 			else if (StringUtilities.isDisplayable(codePoint)) {
 				strBuf.addCodePointChar(codePoint);
 			}
-			else if (StringUtilities.isUnicodeReplacementCodePoint(codePoint)) {
-				// if this is a true decode error and we can recover the original bytes
-				// render as byte seq.
-				// Otherwise, display the <?> symbol.
-				if (canRecoverOriginalCharBytes &&
-					isMismatchedCharBytes(originalCharBytes, codePoint)) {
-					strBuf.addByteSeq(originalCharBytes);
-				}
-				else {
-					strBuf.addCodePointChar(codePoint);
-				}
-			}
 			else {
 				// not simple ascii, decide how to handle:
 				// add the character to the string in a format depending on the
 				// render settings.  ISO control chars are forced to be
 				// escaped regardless of the render setting.
-				RENDER_ENUM thisCharRenderSetting = renderSetting;
-				if (thisCharRenderSetting == RENDER_ENUM.ALL) {
+				if (currentCharRenderSetting == RENDER_ENUM.ALL) {
 					if (codePoint <= 0x7f) {
 						// render non-displayable, non-control-char ascii-ish bytes as bytes instead
 						// of as escape sequences
-						thisCharRenderSetting = RENDER_ENUM.BYTE_SEQ;
+						currentCharRenderSetting = RENDER_ENUM.BYTE_SEQ;
 					}
-					else if (Character.isISOControl(codePoint) || !Character.isDefined(codePoint)) {
-						thisCharRenderSetting = RENDER_ENUM.ESC_SEQ;
+					else if (Character.isISOControl(codePoint) || !Character.isDefined(codePoint) ||
+						codePoint == StringUtilities.UNICODE_BE_BYTE_ORDER_MARK) {
+						currentCharRenderSetting = RENDER_ENUM.ESC_SEQ;
 					}
 				}

-				switch (thisCharRenderSetting) {
+				switch (currentCharRenderSetting) {
 					case ALL:
 						strBuf.addCodePointChar(codePoint);
 						break;
@ -784,9 +762,8 @@ public class StringDataInstance {
 			}
 			i += Character.charCount(codePoint);
 		}
-		String result = strBuf.toString();
 		String prefix = "";
-		if (charsetName.startsWith("UTF") && result.startsWith("\"")) {
+		if (charsetName.startsWith("UTF") && strBuf.startsWithQuotedText()) {
 			switch (charSize) {
 				case 1:
 					prefix = "u8";
@ -799,7 +776,7 @@ public class StringDataInstance {
 					break;
 			}
 		}
-		return prefix + result;
+		return prefix + strBuf.toString();
 	}

 	/**
@ -838,116 +815,31 @@ public class StringDataInstance {
 	}

 	/**
-	 * Convert a sequence of char values in memory into a formatted string, without
-	 * stripping any nulls. 
-	 * <p>
-	 * See {@link #getCharRepresentation()} and {@link #getStringRepresentation()} 
-	 *
-	 * @return String containing the representation of the char sequence
-	 */
-	public String getCharSequenceRepresentation() {
-		return getStringRep(false);
-	}
-
-	/**
-	 * Convert a char value in memory into its canonical unicode representation, using
+	 * Convert a char value (or sequence of char values) in memory into its canonical unicode representation, using
 	 * attached charset and encoding information.
 	 * <p>
-	 * This implementation treats the char value as a 1 element long string and reuses the string
-	 * logic to read it from memory using charset info.
 	 *
-	 * @return String containing the representation of the single char.
+	 * @return String containing the representation of the char.
 	 */
 	public String getCharRepresentation() {
 		if (length < charSize /* also covers case of isProbe() */ ) {
 			return UNKNOWN_DOT_DOT_DOT;
 		}

-		byte[] charBytes = convertPaddedToUnpadded(getStringBytes());
-		if (charBytes == null) {
-			return UNKNOWN_DOT_DOT_DOT;
-		}
+		// Intended fallback: if the charset's charsize is bigger than the number of bytes we have,
+		// use US-ASCII instead (NOTE: the length < charSize guard above returns first, so this never triggers)
+		String newCSName = (length < charSize) ? DEFAULT_CHARSET_NAME : charsetName;

-		AdjustedCharsetInfo aci = getAdjustedCharsetInfo(charBytes);
-		String stringValue = convertBytesToString(charBytes, aci);
-		if (stringValue == null) {
-			return UNKNOWN_DOT_DOT_DOT;
-		}
+		StringDataInstance charseqSDI =
+			new StringDataInstance(this, StringLayoutEnum.CHAR_SEQ, buf, length, newCSName);

-		if (stringValue.length() == 0) {
-			if (aci.byteStartOffset == 0) {
-				return UNKNOWN;
-			}
-
-			// If the byteStartOffset isn't zero it means the char was the unicode BOM.
-			// Asking the Charset to decode it returned nothing, so force it.
-			stringValue = BOM_RESULT_STR;
-		}
-
-		int codePoint = stringValue.codePointAt(0);
-		RENDER_ENUM tmpRenderSetting = renderSetting;
-
-		StringRenderBuilder strBuf =
-			new StringRenderBuilder(charSize, StringRenderBuilder.SINGLE_QUOTE);
-		if (StringUtilities.isControlCharacterOrBackslash(codePoint)) {
-			strBuf.addString(StringUtilities.convertCodePointToEscapeSequence(codePoint));
-		}
-		else if (codePoint == 0x0000 && renderSetting != RENDER_ENUM.BYTE_SEQ) {
-			strBuf.addEscapedChar('0');
-		}
-		else if (StringUtilities.isUnicodeReplacementCodePoint(codePoint) &&
-			renderSetting != RENDER_ENUM.BYTE_SEQ) {
-			strBuf.addCodePointChar(codePoint);
-		}
-		else if (StringUtilities.isDisplayable(codePoint)) {
-			strBuf.addCodePointChar(codePoint);
-		}
-		else {
-			// not simple ascii, decide how to handle:
-			// add the character to the string in a format depending on the
-			// render settings.  ISO control chars are forced to be
-			// escaped regardless of the render setting.
-			boolean alwaysNeedsEscaping = (renderSetting == RENDER_ENUM.ALL) &&
-				(Character.isISOControl(codePoint) || !Character.isDefined(codePoint) ||
-					codePoint == StringUtilities.UNICODE_BE_BYTE_ORDER_MARK);
-
-			tmpRenderSetting = alwaysNeedsEscaping ? RENDER_ENUM.ESC_SEQ : renderSetting;
-
-			switch (tmpRenderSetting) {
-				case ALL:
-					strBuf.addCodePointChar(codePoint);
-					break;
-				case ESC_SEQ:
-					strBuf.addEscapedCodePoint(codePoint);
-					break;
-				case BYTE_SEQ:
-					// BYTE_SEQ uses original bytes of char data, not the bytes produced by the charset
-					strBuf.addByteSeq(charBytes);
-					break;
-			}
-
-		}
-		String prefix = "";
-		if (charsetName.startsWith("UTF") && tmpRenderSetting != RENDER_ENUM.BYTE_SEQ) {
-			switch (charSize) {
-				case 1:
-					prefix = "u8";
-					break;
-				case 2:
-					prefix = "u";
-					break;
-				case 4:
-					prefix = "U";
-					break;
-			}
-		}
-
-		return prefix + strBuf.toString();
+		return charseqSDI.getStringRep(StringRenderBuilder.SINGLE_QUOTE,
+			StringRenderBuilder.DOUBLE_QUOTE);
 	}

 	private boolean isMismatchedCharBytes(byte[] originalCharBytes, int codePoint) {
 		long originalValue = DataConverter.getInstance(buf.isBigEndian()).getValue(
-			originalCharBytes, originalCharBytes.length);
+			originalCharBytes, Math.min(charSize, originalCharBytes.length));
 		return originalValue != codePoint;
 	}

@ -1020,7 +912,7 @@ public class StringDataInstance {
 		}
 		int newLength = Math.max(0, length - byteOffset);
 		StringDataInstance sub = new StringDataInstance(this, getOffcutLayout(),
-			new WrappedMemBuffer(buf, byteOffset), newLength);
+			new WrappedMemBuffer(buf, byteOffset), newLength, charsetName);

 		return sub;
 	}
--- a/Ghidra/Framework/SoftwareModeling/src/main/java/ghidra/program/model/data/StringLayoutEnum.java
+++ b/Ghidra/Framework/SoftwareModeling/src/main/java/ghidra/program/model/data/StringLayoutEnum.java
@ -18,19 +18,39 @@ package ghidra.program.model.data;
 /**
 * Controls strings termination
 * <ul>
- * <li>{@link StringLayoutEnum#FIXED_LEN} (ie. fixed length, trailing nulls trimmed, interior nulls retained)
- * <li>{@link StringLayoutEnum#NULL_TERMINATED_UNBOUNDED} (ie. null terminated and ignores data instance length)
- * <li>{@link StringLayoutEnum#NULL_TERMINATED_BOUNDED} (ie. null-terminated and limited to data instance)
- * <li>{@link StringLayoutEnum#PASCAL_255} (ie. pascal string, using 1 byte for length field, max 255 char elements)
- * <li>{@link StringLayoutEnum#PASCAL_64k} (ie. pascal string, using 2 bytes for length field, max 64k char elements)
+ * <li>{@link StringLayoutEnum#FIXED_LEN}
+ * <li>{@link StringLayoutEnum#CHAR_SEQ}
+ * <li>{@link StringLayoutEnum#NULL_TERMINATED_UNBOUNDED}
+ * <li>{@link StringLayoutEnum#NULL_TERMINATED_BOUNDED}
+ * <li>{@link StringLayoutEnum#PASCAL_255}
+ * <li>{@link StringLayoutEnum#PASCAL_64k}
 * </ul>
 */
 public enum StringLayoutEnum {
+	/**
+	 * Fixed length string, trailing nulls trimmed, interior nulls retained.
+	 */
 	FIXED_LEN("fixed length"),
+	/**
+	 * Fixed length sequence of characters, all nulls retained.
+	 */
+	CHAR_SEQ("char sequence"),
+	/**
+	 * Null terminated string that ignores its container's length when searching for terminating null character.
+	 */
 	NULL_TERMINATED_UNBOUNDED("null-terminated & unbounded"),
+	/**
+	 * Null-terminated string that is limited to its container's length.
+	 */
 	NULL_TERMINATED_BOUNDED("null-terminated & bounded"),
-	PASCAL_255("pascal255"), // prefixed with 1 byte length field which stores number of chars (not bytes) in string
-	PASCAL_64k("pascal64k");// prefixed with 2 byte length field which stores number of chars (not bytes) in string
+	/**
+	 * Pascal string, using 1 byte for length field, max 255 char elements.
+	 */
+	PASCAL_255("pascal255"),
+	/**
+	 * Pascal string, using 2 bytes for length field, max 64k char elements.
+	 */
+	PASCAL_64k("pascal64k");

 	private final String s;

@ -43,13 +63,42 @@ public enum StringLayoutEnum {
 		return s;
 	}

+	/**
+	 * Returns true if this layout is one of the pascal types.
+	 * 
+	 * @return boolean true if pascal
+	 */
 	public boolean isPascal() {
 		return this == PASCAL_255 || this == PASCAL_64k;
 	}

+	/**
+	 * Returns true if this layout is one of the null terminated types.
+	 * 
+	 * @return boolean true if null terminated string
+	 */
 	public boolean isNullTerminated() {
 		return this == NULL_TERMINATED_UNBOUNDED ||
-			this == StringLayoutEnum.NULL_TERMINATED_BOUNDED;
+			this == NULL_TERMINATED_BOUNDED;
+	}
+
+	/**
+	 * Returns true if this layout should have its trailing null characters trimmed.
+	 * 
+	 * @return boolean true if trailing nulls should be trimmed
+	 */
+	public boolean shouldTrimTrailingNulls() {
+		return this == NULL_TERMINATED_UNBOUNDED || this == NULL_TERMINATED_BOUNDED ||
+			this == FIXED_LEN;
+	}
+
+	/**
+	 * Returns true if this layout is one of the fixed-size types.
+	 * 
+	 * @return boolean true if fixed length
+	 */
+	public boolean isFixedLen() {
+		return this == FIXED_LEN || this == CHAR_SEQ;
 	}

 }
--- a/Ghidra/Framework/SoftwareModeling/src/main/java/ghidra/program/model/data/StringRenderBuilder.java
+++ b/Ghidra/Framework/SoftwareModeling/src/main/java/ghidra/program/model/data/StringRenderBuilder.java
@ -46,6 +46,18 @@ public class StringRenderBuilder {
 		this.quoteChar = quoteChar;
 	}

+	/**
+	 * Returns true if the current formatted string starts with a quoted text section,
+	 * instead of a byte value section.  Useful to indicate if
+	 * the string could have a prefix applied to it (ie. u8"text")
+	 * <p>
+	 * 
+	 * @return boolean true if this string will start with a quoted text section
+	 */
+	public boolean startsWithQuotedText() {
+		return sb.length() > 0 && sb.charAt(0) == quoteChar;
+	}
+
 	/**
 	 * Append the characters in the specified string. The added characters will
 	 * be shown in a quoted text region.
--- a/Ghidra/Framework/SoftwareModeling/src/test/java/ghidra/program/model/data/CharDataTypesRenderTest.java
+++ b/Ghidra/Framework/SoftwareModeling/src/test/java/ghidra/program/model/data/CharDataTypesRenderTest.java
@ -157,30 +157,30 @@ public class CharDataTypesRenderTest extends AbstractGTest {

 		// wchar32
 		String result = wchar32DT.getRepresentation(buf32, normset, wchar32DT.getLength());
-		assertEquals("U'\ufffd'", result);
+		assertEquals("AA,AA,AA,AA", result);

 		result = wchar32DT.getRepresentation(buf32, escseq, wchar32DT.getLength());
-		assertEquals("U'\ufffd'", result);
+		assertEquals("AA,AA,AA,AA", result);

 		result = wchar32DT.getRepresentation(buf32, byteseq, wchar32DT.getLength());
 		assertEquals("AA,AA,AA,AA", result);

 		// wchar16
 		result = wchar16DT.getRepresentation(buf16_be, normset, wchar16DT.getLength());
-		assertEquals("u'\ufffd'", result);
+		assertEquals("D8,00", result);

 		result = wchar16DT.getRepresentation(buf16_be, escseq, wchar16DT.getLength());
-		assertEquals("u'\ufffd'", result);
+		assertEquals("D8,00", result);

 		result = wchar16DT.getRepresentation(buf16_be, byteseq, wchar16DT.getLength());
 		assertEquals("D8,00", result);

 		// charDT
 		result = charDT.getRepresentation(buf8, normset, charDT.getLength());
-		assertEquals("'\ufffd'", result);
+		assertEquals("85", result);

 		result = charDT.getRepresentation(buf8, escseq, charDT.getLength());
-		assertEquals("'\ufffd'", result);
+		assertEquals("85", result);

 		result = charDT.getRepresentation(buf8, byteseq, charDT.getLength());
 		assertEquals("85", result);
@ -196,7 +196,7 @@ public class CharDataTypesRenderTest extends AbstractGTest {

 		result = wchar16DT.getRepresentation(mb(false, 0xfd, 0xff),
 			newset().set(RENDER_ENUM.ESC_SEQ), wchar16DT.getLength());
-		assertEquals("u'\uFFFD'", result);
+		assertEquals("u'\\uFFFD'", result);

 		result = wchar16DT.getRepresentation(mb(false, 0xfd, 0xff),
 			newset().set(RENDER_ENUM.BYTE_SEQ), wchar16DT.getLength());
--- a/Ghidra/Framework/SoftwareModeling/src/test/java/ghidra/program/model/data/StringDataTypeTest.java
+++ b/Ghidra/Framework/SoftwareModeling/src/test/java/ghidra/program/model/data/StringDataTypeTest.java
@ -514,45 +514,46 @@ public class StringDataTypeTest extends AbstractGTest {
 	}

 	//-------------------------------------------------------------------------------------
-	// StringDataInstance.hasNullTerminator()
+	// StringDataInstance.isMissingNullTerminator()
 	//-------------------------------------------------------------------------------------

 	@Test
 	public void testHasNullTerm() {
 		ByteMemBufferImpl buf = mb(false, 'a', 'b', 0);

-		assertTrue(mkSDI(termString, buf, newset(), buf.getLength()).hasNullTerminator());
+		assertFalse(mkSDI(termString, buf, newset(), buf.getLength()).isMissingNullTerminator());
 	}

 	@Test
 	public void testHasNullTermEOF() {
 		ByteMemBufferImpl buf = mb(false, 'a', 'b');

-		assertFalse(mkSDI(termString, buf, newset(), buf.getLength()).hasNullTerminator());
+		assertTrue(mkSDI(termString, buf, newset(), buf.getLength()).isMissingNullTerminator());
 	}

 	@Test
 	public void testHasNullTermUTF16() {
 		ByteMemBufferImpl buf = mb(false, 'a', 0, 'b', 0, 0, 0);

-		assertTrue(mkSDI(termUtf16String, buf, newset(), buf.getLength()).hasNullTerminator());
+		assertFalse(
+			mkSDI(termUtf16String, buf, newset(), buf.getLength()).isMissingNullTerminator());
 	}

 	@Test
 	public void testHasNullTermFixed() {
 		ByteMemBufferImpl buf = mb(false, 'a', 'b', 'c', 0, 0, 0);

-		assertFalse(mkSDI(fixedlenString, buf, newset(), 2).hasNullTerminator());
-		assertFalse(mkSDI(fixedlenString, buf, newset(), 3).hasNullTerminator());
-		assertTrue(mkSDI(fixedlenString, buf, newset(), 4).hasNullTerminator());
+		assertTrue(mkSDI(fixedlenString, buf, newset(), 2).isMissingNullTerminator());
+		assertTrue(mkSDI(fixedlenString, buf, newset(), 3).isMissingNullTerminator());
+		assertFalse(mkSDI(fixedlenString, buf, newset(), 4).isMissingNullTerminator());
 	}

 	@Test
 	public void testHasNullTermFixedUTF16() {
 		ByteMemBufferImpl buf = mb(false, 'a', 0, 'b', 0, 'c', 0, 0, 0, 0, 0);

-		assertFalse(mkSDI(fixedUtf16String, buf, newset(), 4).hasNullTerminator());
-		assertFalse(mkSDI(fixedUtf16String, buf, newset(), 6).hasNullTerminator());
-		assertTrue(mkSDI(fixedUtf16String, buf, newset(), 8).hasNullTerminator());
+		assertTrue(mkSDI(fixedUtf16String, buf, newset(), 4).isMissingNullTerminator());
+		assertTrue(mkSDI(fixedUtf16String, buf, newset(), 6).isMissingNullTerminator());
+		assertFalse(mkSDI(fixedUtf16String, buf, newset(), 8).isMissingNullTerminator());
 	}
 }