GT-3407 Fix UnsupportedOp Exception with JISAutodetect charset. #1358

Fixes github issue #1358. Some character sets don't support the encoding operation.
2025-10-05 02:39:44 +02:00 · 2019-12-18 11:05:52 -05:00 · 2019-12-18 11:05:52 -05:00 · 93bcabe582
commit 93bcabe582
parent 4c57727282
2 changed files with 39 additions and 22 deletions
--- a/Ghidra/Framework/SoftwareModeling/src/main/java/ghidra/program/model/data/StringDataInstance.java
+++ b/Ghidra/Framework/SoftwareModeling/src/main/java/ghidra/program/model/data/StringDataInstance.java
@ -684,8 +684,8 @@ public class StringDataInstance {

 		// if we get the same number of characters out that we put into the decoder,
 		// then its a good chance there is a one-to-one correspondence between original char
-		// and decoded char.
-		boolean canRecoverOriginalCharBytes =
+		// offsets and decoded char offsets.
+		boolean isByteToStringCharEquiv =
 			stringValue.length() == ((stringBytes.length - aci.byteStartOffset) / charSize);

 		stringValue = stringLayout.shouldTrimTrailingNulls() ? trimNulls(stringValue) : stringValue;
@ -701,22 +701,10 @@ public class StringDataInstance {
 		// For each 32bit character in the java string try to add it to the StringRenderBuilder
 		for (int i = 0, strLength = stringValue.length(); i < strLength;) {
 			int codePoint = stringValue.codePointAt(i);
-			byte[] originalCharBytes;
-			if (canRecoverOriginalCharBytes) {
-				originalCharBytes = new byte[charSize];
-				System.arraycopy(stringBytes, i * charSize + aci.byteStartOffset, originalCharBytes,
-					0, charSize);
-			}
-			else {
-				// can't get original bytes, cheat and run the codePoint through the charset
-				// to get what should be the same as the original bytes.
-				String singleCharStr = new String(new int[] { codePoint }, 0, 1);
-				originalCharBytes = convertStringToBytes(singleCharStr, aci);
-			}

 			RENDER_ENUM currentCharRenderSetting = renderSetting;
-			if (codePoint == StringUtilities.UNICODE_REPLACEMENT && canRecoverOriginalCharBytes &&
-				isMismatchedCharBytes(originalCharBytes, codePoint)) {
+			if (codePoint == StringUtilities.UNICODE_REPLACEMENT && isByteToStringCharEquiv &&
+				!isReplacementCharAt(stringBytes, i * charSize + aci.byteStartOffset)) {
 				// if this is a true decode error and we can recover the original bytes,
 				// then force the render mode to byte seq.
 				currentCharRenderSetting = RENDER_ENUM.BYTE_SEQ;
@ -753,7 +741,8 @@ public class StringDataInstance {
 						strBuf.addCodePointChar(codePoint);
 						break;
 					case BYTE_SEQ:
-						strBuf.addByteSeq(originalCharBytes);
+						strBuf.addByteSeq(getOriginalBytes(isByteToStringCharEquiv, i, codePoint,
+							stringBytes, aci));
 						break;
 					case ESC_SEQ:
 						strBuf.addEscapedCodePoint(codePoint);
@ -779,6 +768,26 @@ public class StringDataInstance {
 		return prefix + strBuf.toString();
 	}

+	private byte[] getOriginalBytes(boolean isByteToStringCharEquiv, int charOffset, int codePoint,
+			byte[] stringBytes, AdjustedCharsetInfo aci) {
+
+		if (isByteToStringCharEquiv) {
+			byte[] originalCharBytes = new byte[charSize];
+			System.arraycopy(stringBytes, charOffset * charSize + aci.byteStartOffset,
+				originalCharBytes, 0, charSize);
+			return originalCharBytes;
+		}
+
+		// can't get original bytes, cheat and run the codePoint through the charset
+		// to get what should be the same as the original bytes.
+		String singleCharStr = new String(new int[] { codePoint }, 0, 1);
+		Charset cs = Charset.isSupported(aci.charsetName) ? Charset.forName(aci.charsetName) : null;
+		if (cs == null || !cs.canEncode()) {
+			return null;
+		}
+		return singleCharStr.getBytes(cs);
+	}
+
 	/**
 	 * Trims trailing nulls off the end of the string.
 	 *
@ -837,10 +846,13 @@ public class StringDataInstance {
 			StringRenderBuilder.DOUBLE_QUOTE);
 	}

-	private boolean isMismatchedCharBytes(byte[] originalCharBytes, int codePoint) {
-		long originalValue = DataConverter.getInstance(buf.isBigEndian()).getValue(
-			originalCharBytes, Math.min(charSize, originalCharBytes.length));
-		return originalValue != codePoint;
+	private boolean isReplacementCharAt(byte[] stringBytes, int byteOffset) {
+		if (byteOffset + charSize > stringBytes.length) {
+			return false;
+		}
+		long origCodePointValue = DataConverter.getInstance(buf.isBigEndian()).getValue(stringBytes,
+			byteOffset, charSize);
+		return origCodePointValue == StringUtilities.UNICODE_REPLACEMENT;
 	}

 	private static String getTranslatedStringRepresentation(String translatedString) {
--- a/Ghidra/Framework/SoftwareModeling/src/main/java/ghidra/program/model/data/StringRenderBuilder.java
+++ b/Ghidra/Framework/SoftwareModeling/src/main/java/ghidra/program/model/data/StringRenderBuilder.java
@ -112,9 +112,14 @@ public class StringRenderBuilder {
 	 * <p>
 	 * {@literal { 0, 1, 2 } -> 00,01,02}
 	 *
-	 * @param bytes
+	 * @param bytes to convert to hex and append.  If null, append "???"
 	 */
 	public void addByteSeq(byte[] bytes) {
+		if (bytes == null) {
+			ensureByteMode();
+			sb.append("???");
+			return;
+		}
 		for (int i = 0; i < bytes.length; i++) {
 			ensureByteMode();
 			String valStr = Integer.toHexString(bytes[i] & 0xff).toUpperCase();