Zurück

Ensure the behaviour of a parser

The following Java test class exercises a mysterious parser whose behaviour is unknown.

/**
 * Exploratory tests that record how a parser's {@code preprocessAndSanitizeLine} method reacts
 * to multibyte UTF-8 input, plus a few small experiments about byte and char handling in Java.
 *
 * <p>What the parser actually does is not visible from this class; the tests only compare its
 * output with its input.
 */
public class CharEncodingTest
{

    /**
     * Writes a single RIGHT DOUBLE QUOTATION MARK as UTF-8 to a temporary file, reads it back
     * once as UTF-8 and once as windows-1252, and checks in both cases that the parser returns
     * the line unchanged.
     */
    @Test
    public void testUTF8CharReadAsCP1252Char() throws Exception
    {
        MysteriousXMLParser parser = new MysteriousXMLParser();

        /*
         * RIGHT DOUBLE QUOTATION MARK -> UTF-8 Hex Bytes E2 80 9D
         * Written as a unicode escape so the test does not depend on the source file encoding.
         */
        String charUnderTest = "\u201D";

        // getBytes() returns exactly the encoded bytes; ByteBuffer.array() may return a longer
        // backing array padded with zero bytes.
        byte[] charUnderTestInUTF8 = charUnderTest.getBytes(StandardCharsets.UTF_8);
        assertEquals("e2809d", bytesAsHex(charUnderTestInUTF8));

        // A fresh temp file keeps the test portable and free of leftovers from earlier runs.
        File multibytesFromFilterRegion = File.createTempFile("multibytesFromFilterRegion", ".txt");
        boolean deleteSuccess;
        try
        {
            byte[] newlineBytes = System.lineSeparator().getBytes(StandardCharsets.UTF_8);
            FileUtils.writeByteArrayToFile(multibytesFromFilterRegion, charUnderTestInUTF8, true);
            FileUtils.writeByteArrayToFile(multibytesFromFilterRegion, newlineBytes, true);

            String input = readLine(multibytesFromFilterRegion, "UTF-8");
            assertEquals(charUnderTest, input);
            assertEquals(input, parser.preprocessAndSanitizeLine(input));

            // Decode with the wrong charset on purpose. The charset is named explicitly because
            // the platform default may itself be UTF-8, which would defeat the test.
            input = readLine(multibytesFromFilterRegion, "windows-1252");
            System.out.println("parser.preprocessAndSanitizeLine(input): " + parser.preprocessAndSanitizeLine(input));
            assertEquals(input, parser.preprocessAndSanitizeLine(input));
            assertFalse(charUnderTest.equals(input));
        }
        finally
        {
            // Always clean up; the result is asserted afterwards so that it cannot mask a
            // failure thrown from the try block.
            deleteSuccess = multibytesFromFilterRegion.delete();
        }
        assertTrue(deleteSuccess);

    }

    /**
     * Reads the first line of a file using the given charset.
     *
     * @param multibytesFromFilterRegion file to read
     * @param charSet name of the charset used for decoding
     * @return the first line without its line terminator, or {@code null} if the file is empty
     * @throws FileNotFoundException if the file cannot be opened
     * @throws UnsupportedEncodingException if the named charset is not supported
     * @throws IOException if reading fails
     */
    private String readLine(final File multibytesFromFilterRegion, final String charSet) throws FileNotFoundException, UnsupportedEncodingException, IOException
    {
        // Each stream is declared separately so every one is closed even if a later
        // constructor throws.
        try (FileInputStream fis = new FileInputStream(multibytesFromFilterRegion);
                InputStreamReader isr = new InputStreamReader(fis, charSet);
                BufferedReader br = new BufferedReader(isr))
        {
            return br.readLine();
        }
    }

    /**
     * Formats a byte array as lower-case hexadecimal, two digits per byte.
     *
     * @param ba bytes to format
     * @return the concatenated hex digits, e.g. {@code "e2809d"}
     */
    private String bytesAsHex(final byte[] ba)
    {
        StringBuilder hexStringBuilder = new StringBuilder(ba.length * 2);
        for (byte b : ba)
        {
            // Mask to 0..255 so negative bytes are printed as their unsigned value.
            hexStringBuilder.append(String.format("%02x", b & 0xFF));
        }
        return hexStringBuilder.toString();
    }

    /**
     * Prints the size in bytes of several primitive types. Purely informational; nothing is
     * asserted.
     */
    @Test
    public void testSizeOfPrimitiveTypes() throws Exception
    {
        System.out.println("Size of byte: " + (Byte.SIZE / 8) + " bytes.");
        System.out.println("Size of short: " + (Short.SIZE / 8) + " bytes.");
        System.out.println("Size of int: " + (Integer.SIZE / 8) + " bytes.");
        System.out.println("Size of long: " + (Long.SIZE / 8) + " bytes.");
        System.out.println("Size of char: " + (Character.SIZE / 8) + " bytes.");
        System.out.println("Size of float: " + (Float.SIZE / 8) + " bytes.");
        System.out.println("Size of double: " + (Double.SIZE / 8) + " bytes.");
    }

    /**
     * Generates the first {@code charsetSize} chars, selects those whose first two UTF-8 bytes,
     * read as one 16-bit value, fall strictly between 0xD7FF and 0xE000, writes sliding windows
     * of them to a temporary file and checks that the parser returns every line unchanged.
     *
     * <p>NOTE(review): with {@code charsetSize = 536} the highest char is U+0217, whose UTF-8
     * lead byte is 0xC8, so nothing reaches the filtered range and the parser part runs on an
     * empty file. Raise {@code charsetSize} if the parser is meant to be exercised.
     */
    @Test
    public void testMultibyteCharsReaderAndWriterHasCorrectEncoding() throws Exception
    {
        PrintStream utf8out = new PrintStream(System.out, true, "UTF-8");
        List<String> characters = new ArrayList<>();
        List<byte[]> charactersUtf8Bytes = new ArrayList<>();
        char currentChar = 0;
        int charsetSize = 536;
        for (int i = 0; i < charsetSize; i++)
        {
            String currentCharAsString = String.valueOf(currentChar);
            characters.add(currentCharAsString);
            // getBytes() yields exactly the encoded bytes, without backing-array padding.
            charactersUtf8Bytes.add(currentCharAsString.getBytes(StandardCharsets.UTF_8));
            utf8out.print(currentChar);
            if (i % 80 == 0)
            {
                utf8out.print(System.lineSeparator());
            }
            currentChar++;
        }
        utf8out.print(System.lineSeparator());
        System.out.println(characters.size() + " chars created");

        char greaterThanValue = 0xD7FF;
        char lessThanValue = 0xE000;
        System.out.println("char filter ranges from " + (greaterThanValue + 1) + " to " + (lessThanValue - 1));

        List<byte[]> multibyteCharactersStartingWithCharFromFilteredRegion = new ArrayList<>();

        for (byte[] ca : charactersUtf8Bytes)
        {
            if (ca.length < 2)
            {
                // Single-byte encodings have no second byte to combine; skip them rather than
                // testing a value left over from an earlier iteration.
                continue;
            }
            char firstTwoBytes = concatBytesToChar(ca);
            if (firstTwoBytes > greaterThanValue && firstTwoBytes < lessThanValue)
            {
                multibyteCharactersStartingWithCharFromFilteredRegion.add(ca);
            }
        }

        System.out.println(multibyteCharactersStartingWithCharFromFilteredRegion.size() + " multibyte chars in filtered region:");

        for (int i = 0; i < multibyteCharactersStartingWithCharFromFilteredRegion.size(); i++)
        {
            byte[] ba = multibyteCharactersStartingWithCharFromFilteredRegion.get(i);
            printFirstCharAsHex(ba);
            utf8out.print(StandardCharsets.UTF_8.decode(ByteBuffer.wrap(ba)) + ",");

            if (i % 6 == 0)
            {
                System.out.print(System.lineSeparator());
            }
        }
        System.out.print(System.lineSeparator());

        // createTempFile creates the (empty) file, so reading works even if nothing is written.
        File multibytesFromFilterRegion = File.createTempFile("multibytesFromFilterRegion", ".txt");
        int failedLines = 0;
        boolean success;
        try
        {
            byte[] newlineBytes = System.lineSeparator().getBytes(StandardCharsets.UTF_8);
            int windowSize = 12;
            // One line per sliding window; "+ 1" includes the window that ends on the last element.
            int windowCount = multibyteCharactersStartingWithCharFromFilteredRegion.size() - windowSize + 1;
            for (int i = 0; i < windowCount; i++)
            {
                for (int j = 0; j < windowSize; j++)
                {
                    FileUtils.writeByteArrayToFile(multibytesFromFilterRegion, multibyteCharactersStartingWithCharFromFilteredRegion.get(i + j), true);
                }
                FileUtils.writeByteArrayToFile(multibytesFromFilterRegion, newlineBytes, true);
            }

            // NOTE(review): the other test uses MysteriousXMLParser; confirm which type is intended.
            XMLVorgangsParser parser = new XMLVorgangsParser();
            try (FileInputStream fis = new FileInputStream(multibytesFromFilterRegion);
                    InputStreamReader isr = new InputStreamReader(fis, "UTF-8");
                    BufferedReader br = new BufferedReader(isr))
            {
                String input = br.readLine();
                while (input != null)
                {
                    String output = parser.preprocessAndSanitizeLine(input);
                    if (!input.equals(output))
                    {
                        // Keep going so every mismatching line is reported, but remember the
                        // failure so the test does not pass silently.
                        failedLines++;
                        utf8out.println("failed on input " + input + " output " + output);
                        utf8out.println("unknown bytes ignored: " + decodeText(input, StandardCharsets.UTF_8, CodingErrorAction.IGNORE));
                    }
                    input = br.readLine();
                }
            }
        }
        finally
        {
            success = multibytesFromFilterRegion.delete();
        }
        assertTrue(success);
        assertEquals(0, failedLines);

    }

    /**
     * Prints the first two bytes of the array as four hex digits, wrapped in {@code [B@...]}.
     *
     * @param ba byte array with at least two elements
     */
    private void printFirstCharAsHex(final byte[] ba)
    {
        int firstTwoBytes = concatBytesToChar(ba);
        System.out.print("[B@");
        System.out.print(String.format("%04x", firstTwoBytes));
        System.out.print("]");
    }

    /**
     * Combines the first two bytes of the array into one 16-bit value, first byte in the high
     * half.
     *
     * @param ca byte array with at least two elements
     * @return {@code ca[0]} and {@code ca[1]} concatenated as an unsigned 16-bit value
     * @throws ArrayIndexOutOfBoundsException if the array has fewer than two elements
     */
    private char concatBytesToChar(final byte[] ca)
    {
        // Mask both bytes: without it a negative low byte is sign-extended and corrupts the
        // high byte of the result.
        return (char) (((ca[0] & 0xFF) << 8) | (ca[1] & 0xFF));
    }

    /**
     * Prints the value 0xd8 shifted left by 0 to 8 bits as hex, as a char and as a decimal
     * number. Purely informational; nothing is asserted.
     */
    @Test
    public void testByteShiftBitwise() throws Exception
    {
        char value = 0xd8;
        for (int i = 0; i < 9; i++)
        {
            System.out.print(value + " shifted " + i + " times: ");
            char target = (char) (value << i);
            int number = target;
            String hex = String.format("%04x", number);
            System.out.println("\t" + hex + "\t" + target + "\t" + number);
        }
    }

    /**
     * Encodes the text with the given charset and decodes it again using a decoder whose
     * malformed-input action is set to {@code codingErrorAction}.
     *
     * @param input text to round-trip
     * @param charset charset used for both encoding and decoding
     * @param codingErrorAction action the decoder takes on malformed input
     * @return the first line of the decoded text, or {@code null} if the encoded input is empty
     * @throws IOException if decoding fails
     */
    String decodeText(final String input, final Charset charset, final CodingErrorAction codingErrorAction) throws IOException
    {
        CharsetDecoder charsetDecoder = charset.newDecoder();
        charsetDecoder.onMalformedInput(codingErrorAction);
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(new ByteArrayInputStream(input.getBytes(charset)), charsetDecoder)))
        {
            return reader.readLine();
        }
    }
}