// CharEncodingTest

public class CharEncodingTest
{

   @Test
   public void testUTF8CharReadAsCP1252Char() throws Exception
   {
       XMLVorgangsParser parser = new XMLVorgangsParser();

       /*
        * RIGHT DOUBLE QUOTATION MARK -> UTF-8 Hex Bytes E2 80 9D
        */
       String charUnderTest = "”";

       byte[] charUnderTestInUTF8 = StandardCharsets.UTF_8.encode(charUnderTest).array();
       assertTrue(bytesAsHex(charUnderTestInUTF8).equals("00e20080009d"));

       File multibytesFromFilterRegion = new File("c:\\tmp\\multibytesFromFilterRegion.txt");
       byte[] newlineBytes = StandardCharsets.UTF_8.encode(System.lineSeparator()).array();
       FileUtils.writeByteArrayToFile(multibytesFromFilterRegion, charUnderTestInUTF8, true);
       FileUtils.writeByteArrayToFile(multibytesFromFilterRegion, newlineBytes, true);

       String input = readLine(multibytesFromFilterRegion, "UTF-8");
       assertTrue(input.equals(charUnderTest));
       assertEquals(input, parser.preprocessAndSanitizeLine(input));

       input = readLine(multibytesFromFilterRegion, Charset.defaultCharset().toString());
       System.out.println("parser.preprocessAndSanitizeLine(input): " + parser.preprocessAndSanitizeLine(input));
       assertEquals(input, parser.preprocessAndSanitizeLine(input));
       assertFalse(input.equals(charUnderTest));

       boolean deleteSuccess = multibytesFromFilterRegion.delete();
       assertTrue(deleteSuccess);

   }

   private String readLine(final File multibytesFromFilterRegion, final String charSet) throws FileNotFoundException, UnsupportedEncodingException, IOException
   {
       FileInputStream fis = new FileInputStream(multibytesFromFilterRegion);
       InputStreamReader isr = new InputStreamReader(fis, charSet);
       BufferedReader br = new BufferedReader(isr);
       String input = br.readLine();
       br.close();
       return input;
   }

   private String bytesAsHex(final byte[] ba)
   {
       StringBuilder hexStringBuilder = new StringBuilder();
       for (int i = 0; i < ba.length; i++)
       {
           String hex = String.format("%04x", ba[i]);
           hexStringBuilder.append(hex);
       }
       return hexStringBuilder.toString();
   }

   @Test
   public void testSizeOfPrimitiveTypes() throws Exception
   {
       System.out.println("Size of byte: " + (Byte.SIZE / 8) + " bytes.");
       System.out.println("Size of short: " + (Short.SIZE / 8) + " bytes.");
       System.out.println("Size of int: " + (Integer.SIZE / 8) + " bytes.");
       System.out.println("Size of long: " + (Long.SIZE / 8) + " bytes.");
       System.out.println("Size of char: " + (Character.SIZE / 8) + " bytes.");
       System.out.println("Size of float: " + (Float.SIZE / 8) + " bytes.");
       System.out.println("Size of double: " + (Double.SIZE / 8) + " bytes.");
   }

   @Test
   public void testMultibyteCharsReaderAndWriterHasCorrectEncoding() throws Exception
   {
       PrintStream utf8out = new PrintStream(System.out, true, "UTF-8");
       List<String> characters = new ArrayList<>();
       List<byte[]> charactersUtf8Bytes = new ArrayList<>();
       char currentChar = 0;
       int charsetSize = 536;
       for (int i = 0; i < charsetSize; i++)
       {
           String currentCharAsString = String.valueOf(currentChar);
           characters.add(currentCharAsString);
           charactersUtf8Bytes.add(StandardCharsets.UTF_8.encode(currentCharAsString).array());
           utf8out.print(currentChar);
           if (i % 80 == 0)
           {
               utf8out.print(System.lineSeparator());
           }
           currentChar++;
       }
       utf8out.print(System.lineSeparator());
       System.out.println(characters.size() + " chars created");

       char greaterThanValue = 0xD7FF;
       char lessThanValue = 0xE000;
       System.out.println("char filter ranges from " + (greaterThanValue + 1) + " to " + (lessThanValue - 1));

       List<byte[]> multibyteCharactersStartingWithCharFromFilteredRegion = new ArrayList<>();

       for (byte[] ca : charactersUtf8Bytes)
       {
           if (ca.length > 1)
           {
               currentChar = concatBytesToChar(ca);
           }
           if (currentChar > greaterThanValue && currentChar < lessThanValue)
           {
               multibyteCharactersStartingWithCharFromFilteredRegion.add(ca);
           }
       }

       System.out.println(multibyteCharactersStartingWithCharFromFilteredRegion.size() + " multibyte chars in filtered region:");

       for (byte[] ba : multibyteCharactersStartingWithCharFromFilteredRegion)
       {
           printFirstCharAsHex(ba);
           utf8out.print(StandardCharsets.UTF_8.decode(ByteBuffer.wrap(ba)) + ",");

           if (multibyteCharactersStartingWithCharFromFilteredRegion.indexOf(ba) % 6 == 0)
           {
               System.out.print(System.lineSeparator());
           }
       }
       System.out.print(System.lineSeparator());

       File multibytesFromFilterRegion = new File("c:\\tmp\\multibytesFromFilterRegion.txt");
       byte[] newlineBytes = StandardCharsets.UTF_8.encode(System.lineSeparator()).array();
       int windowSize = 12;
       if (!multibyteCharactersStartingWithCharFromFilteredRegion.isEmpty() && multibyteCharactersStartingWithCharFromFilteredRegion.size() > 12)
       {
           for (int i = 0; i < multibyteCharactersStartingWithCharFromFilteredRegion.size() - 12; i++)
           {
               for (int j = 0; j < windowSize; j++)
               {
                   FileUtils.writeByteArrayToFile(multibytesFromFilterRegion, multibyteCharactersStartingWithCharFromFilteredRegion.get(i + j), true);
               }
               FileUtils.writeByteArrayToFile(multibytesFromFilterRegion, newlineBytes, true);
           }
       }

       if (multibytesFromFilterRegion != null)
       {
           FileInputStream fis = new FileInputStream(multibytesFromFilterRegion); //
           InputStreamReader isr = new InputStreamReader(fis, "UTF-8");
           BufferedReader br = new BufferedReader(isr);
           XMLVorgangsParser parser = new XMLVorgangsParser();
           String input = br.readLine();
           while (input != null)
           {
               try
               {
                   assertEquals(input, parser.preprocessAndSanitizeLine(input));
               }
               catch (AssertionError e)
               {
                   utf8out.println("failed on input " + input + " output " + parser.preprocessAndSanitizeLine(input));
                   utf8out.println("unknown bytes ignored: " + decodeText(input, StandardCharset.UTF_8, CodingErrorAction.IGNORE));
               }
               input = br.readLine();
           }
           br.close();
       }

       boolean success = multibytesFromFilterRegion.delete();
       assertTrue(success);

   }

   private void printFirstCharAsHex(final byte[] ba)
   {
       System.out.print("[B@");
       char concatBytesToChar = concatBytesToChar(ba);
       int concatBytesToInt = concatBytesToChar;
       String hex = String.format("%04x", concatBytesToInt);
       System.out.print(hex);
       System.out.print("]");
   }

   private char concatBytesToChar(final byte[] ca)
   {
       char currentChar;
       currentChar = (char) (ca[0] << 8);
       currentChar += ca[1];
       return currentChar;
   }

   @Test
   public void testByteShiftBitwise() throws Exception
   {
       char value = 0xd8;
       char target = 0;
       int number = target;
       for (int i = 0; i < 9; i++)
       {
           System.out.print(value + " shifted " + i + " times: ");
           target = (char) (value << i);
           number = target;
           String hex = String.format("%04x", number);
           System.out.println("\t" + hex + "\t" + target + "\t" + number);
       }
   }

   String decodeText(final String input, final Charset charset, final CodingErrorAction codingErrorAction) throws IOException
   {
       CharsetDecoder charsetDecoder = charset.newDecoder();
       charsetDecoder.onMalformedInput(codingErrorAction);
       return new BufferedReader(new InputStreamReader(new ByteArrayInputStream(input.getBytes(charset)), charsetDecoder)).readLine();
   }
}