Added BOM detection/encoding detection via aws_text_encoding_detect (#647)

* Added BOM detection/encoding detection via aws_text_encoding_detect
* Added aws_text_is_utf8 and tests, as well as ascii resource test
awslabs · Jun 12, 2020 · 9765afb · 9765afb
1 parent 412d0cb
commit 9765afb
Show file tree

Hide file tree

Showing 8 changed files with 184 additions and 0 deletions.
diff --git a/include/aws/common/encoding.h b/include/aws/common/encoding.h
@@ -140,6 +140,24 @@ AWS_STATIC_IMPL void aws_write_u16(uint16_t value, uint8_t *buffer);
  */
 AWS_STATIC_IMPL uint16_t aws_read_u16(const uint8_t *buffer);
 
+/* Text encodings that aws_text_detect_encoding() can report. */
+enum aws_text_encoding {
+    AWS_TEXT_UNKNOWN, /* no recognized BOM and at least one byte has its high bit set */
+    AWS_TEXT_UTF8,    /* buffer starts with the UTF-8 BOM (EF BB BF) */
+    AWS_TEXT_UTF16,   /* buffer starts with a UTF-16 BOM, either byte order */
+    AWS_TEXT_UTF32,   /* buffer starts with a UTF-32 BOM, either byte order */
+    AWS_TEXT_ASCII,   /* no recognized BOM and every byte is below 0x80 (includes an empty buffer) */
+};
+
+/* Checks the BOM in the buffer to see if encoding can be determined. If there is no
+ * recognized BOM, AWS_TEXT_ASCII is returned when every byte is 7-bit (this includes an
+ * empty buffer); otherwise AWS_TEXT_UNKNOWN will be returned.
+ */
+AWS_STATIC_IMPL enum aws_text_encoding aws_text_detect_encoding(const uint8_t *bytes, size_t size);
+
+/*
+ * Returns true if the supplied bytes begin with a UTF8 BOM or consist solely of 7-bit
+ * ASCII bytes. NOTE: the content is not validated as UTF8; BOM-less text that contains
+ * bytes with the high bit set (e.g. multi-byte UTF8 sequences) is reported as false.
+ */
+AWS_STATIC_IMPL bool aws_text_is_utf8(const uint8_t *bytes, size_t size);
+
 #ifndef AWS_NO_STATIC_IMPL
 #    include <aws/common/encoding.inl>
 #endif /* AWS_NO_STATIC_IMPL */

diff --git a/include/aws/common/encoding.inl b/include/aws/common/encoding.inl
@@ -109,6 +109,44 @@ AWS_STATIC_IMPL uint16_t aws_read_u16(const uint8_t *buffer) {
     return aws_ntoh16(value);
 }
 
+/*
+ * Guesses the text encoding of a buffer, first from a leading byte-order mark and then,
+ * failing that, by checking whether the content is pure 7-bit ASCII.
+ *
+ * A UTF-8 BOM yields AWS_TEXT_UTF8, a UTF-32 BOM (either byte order) AWS_TEXT_UTF32 and a
+ * UTF-16 BOM (either byte order) AWS_TEXT_UTF16. UTF-32 is tested before UTF-16 because
+ * the UTF-32 LE BOM begins with the same two bytes as the UTF-16 LE BOM. Without a BOM
+ * the result is AWS_TEXT_ASCII when no byte has its high bit set (this includes an empty
+ * buffer) and AWS_TEXT_UNKNOWN otherwise.
+ *
+ * Reference: https://unicodebook.readthedocs.io/guess_encoding.html
+ */
+AWS_STATIC_IMPL enum aws_text_encoding aws_text_detect_encoding(const uint8_t *bytes, size_t size) {
+    static const uint8_t s_bom_utf8[] = {0xEF, 0xBB, 0xBF};
+    static const uint8_t s_bom_utf32_le[] = {0xFF, 0xFE, 0x00, 0x00};
+    static const uint8_t s_bom_utf32_be[] = {0x00, 0x00, 0xFE, 0xFF};
+    static const uint8_t s_bom_utf16_le[] = {0xFF, 0xFE};
+    static const uint8_t s_bom_utf16_be[] = {0xFE, 0xFF};
+
+    if (size >= sizeof(s_bom_utf8) && memcmp(bytes, s_bom_utf8, sizeof(s_bom_utf8)) == 0) {
+        return AWS_TEXT_UTF8;
+    }
+
+    /* Must precede the UTF-16 check: FF FE 00 00 also starts with the UTF-16 LE BOM. */
+    if (size >= sizeof(s_bom_utf32_le) && (memcmp(bytes, s_bom_utf32_le, sizeof(s_bom_utf32_le)) == 0 ||
+                                           memcmp(bytes, s_bom_utf32_be, sizeof(s_bom_utf32_be)) == 0)) {
+        return AWS_TEXT_UTF32;
+    }
+
+    if (size >= sizeof(s_bom_utf16_le) && (memcmp(bytes, s_bom_utf16_le, sizeof(s_bom_utf16_le)) == 0 ||
+                                           memcmp(bytes, s_bom_utf16_be, sizeof(s_bom_utf16_be)) == 0)) {
+        return AWS_TEXT_UTF16;
+    }
+
+    /* No BOM: the buffer only counts as ASCII if every byte fits in 7 bits. */
+    size_t pos = 0;
+    while (pos < size) {
+        if (bytes[pos] > 0x7F) {
+            return AWS_TEXT_UNKNOWN;
+        }
+        ++pos;
+    }
+    return AWS_TEXT_ASCII;
+}
+
+/*
+ * Reports whether aws_text_detect_encoding() classifies the buffer as UTF-8 (BOM present)
+ * or as plain ASCII. Any other classification, including AWS_TEXT_UNKNOWN, yields false.
+ */
+AWS_STATIC_IMPL bool aws_text_is_utf8(const uint8_t *bytes, size_t size) {
+    switch (aws_text_detect_encoding(bytes, size)) {
+        case AWS_TEXT_UTF8:
+        case AWS_TEXT_ASCII:
+            return true;
+        default:
+            return false;
+    }
+}
+
 AWS_EXTERN_C_END
 
 #endif /*  AWS_COMMON_ENCODING_INL */
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
@@ -136,6 +136,10 @@ add_test_case(uint16_buffer_test)
 add_test_case(uint16_buffer_non_aligned_test)
 add_test_case(uint16_buffer_signed_positive_test)
 add_test_case(uint16_buffer_signed_negative_test)
+# Text encoding / BOM detection tests (defined in encoding_test.c)
+add_test_case(text_encoding_utf8)
+add_test_case(text_encoding_utf16)
+add_test_case(text_encoding_ascii)
+add_test_case(text_encoding_is_utf8)
 
 add_test_case(scheduler_cleanup_cancellation)
 add_test_case(scheduler_ordering_test)
@@ -449,3 +453,8 @@ endif()
 
 file(GLOB FUZZ_TESTS    "fuzz/*.c")
 aws_add_fuzz_tests("${FUZZ_TESTS}" "" "")
+
+# Resources to use for testing: copy the contents of the tests' "resources" source
+# directory into the tests' binary directory so test cases can open the files by
+# relative path (e.g. "./utf8.txt").
+# NOTE(review): presumably the tests must then run with that binary directory as their
+# working directory -- confirm.
+add_custom_command(TARGET ${PROJECT_NAME}-tests PRE_BUILD
+        COMMAND ${CMAKE_COMMAND} -E copy_directory
+        ${CMAKE_CURRENT_SOURCE_DIR}/resources ${CMAKE_CURRENT_BINARY_DIR})
diff --git a/tests/encoding_test.c b/tests/encoding_test.c
@@ -997,3 +997,120 @@ static int s_hex_encoding_append_dynamic_test_case_empty(struct aws_allocator *a
 }
 
 AWS_TEST_CASE(hex_encoding_append_dynamic_test_case_empty, s_hex_encoding_append_dynamic_test_case_empty)
+
+/*
+ * Reads the entire file at `filename` into `out_buf`.
+ *
+ * On success returns AWS_OP_SUCCESS; `out_buf` then owns a buffer allocated from `alloc`
+ * that the caller must release with aws_byte_buf_clean_up(). `out_buf->len` is the file
+ * size in bytes, and one extra zero byte is stored just past `len` so the contents can be
+ * handed to null-terminated APIs.
+ *
+ * On any failure returns AWS_OP_ERR with nothing left for the caller to free.
+ */
+static int read_file_contents(struct aws_byte_buf *out_buf, struct aws_allocator *alloc, const char *filename) {
+    AWS_ZERO_STRUCT(*out_buf);
+
+    /* Binary mode: the resources include UTF-16 data, and text mode may translate line
+     * endings (or stop at 0x1A) on some platforms, making fread() return fewer bytes than
+     * the size reported by ftell(). */
+    FILE *fp = fopen(filename, "rb");
+    if (!fp) {
+        return AWS_OP_ERR;
+    }
+
+    if (fseek(fp, 0L, SEEK_END)) {
+        fclose(fp);
+        return AWS_OP_ERR;
+    }
+
+    /* ftell() reports failure as -1L; casting that to size_t and adding 1 would wrap to a
+     * zero-byte allocation and make the length computation below underflow. */
+    long file_size = ftell(fp);
+    if (file_size < 0) {
+        fclose(fp);
+        return AWS_OP_ERR;
+    }
+
+    size_t allocation_size = (size_t)file_size + 1;
+    /* Tell the user that we allocate here and if success they're responsible for the free. */
+    if (aws_byte_buf_init(out_buf, alloc, allocation_size)) {
+        fclose(fp);
+        return AWS_OP_ERR;
+    }
+
+    /* Ensure compatibility with null-terminated APIs, but don't consider
+     * the null terminator part of the length of the payload */
+    out_buf->len = out_buf->capacity - 1;
+    out_buf->buffer[out_buf->len] = 0;
+
+    if (fseek(fp, 0L, SEEK_SET)) {
+        aws_byte_buf_clean_up(out_buf);
+        fclose(fp);
+        return AWS_OP_ERR;
+    }
+
+    size_t read = fread(out_buf->buffer, 1, out_buf->len, fp);
+    fclose(fp);
+    /* A short read means the file could not be read in full; treat it as an error. */
+    if (read < out_buf->len) {
+        aws_byte_buf_clean_up(out_buf);
+        return AWS_OP_ERR;
+    }
+
+    return AWS_OP_SUCCESS;
+}
+
+/* Verifies that the UTF-8 resource file is detected as AWS_TEXT_UTF8. */
+static int s_text_encoding_utf8(struct aws_allocator *allocator, void *ctx) {
+    (void)ctx;
+
+    struct aws_byte_buf contents;
+    ASSERT_SUCCESS(read_file_contents(&contents, allocator, "./utf8.txt"));
+
+    /* Release the buffer before asserting so that a failed assertion (which presumably
+     * returns early from the test) does not leak it. */
+    enum aws_text_encoding detected = aws_text_detect_encoding(contents.buffer, contents.len);
+    aws_byte_buf_clean_up(&contents);
+    ASSERT_INT_EQUALS(AWS_TEXT_UTF8, detected);
+
+    return 0;
+}
+
+AWS_TEST_CASE(text_encoding_utf8, s_text_encoding_utf8)
+
+/* Verifies that both the little-endian and big-endian UTF-16 resource files are
+ * detected as AWS_TEXT_UTF16. */
+static int s_text_encoding_utf16(struct aws_allocator *allocator, void *ctx) {
+    (void)ctx;
+
+    struct aws_byte_buf contents;
+    enum aws_text_encoding detected;
+
+    /* In each case the buffer is released before asserting so that a failed assertion
+     * (which presumably returns early from the test) does not leak it. */
+    ASSERT_SUCCESS(read_file_contents(&contents, allocator, "./utf16le.txt"));
+    detected = aws_text_detect_encoding(contents.buffer, contents.len);
+    aws_byte_buf_clean_up(&contents);
+    ASSERT_INT_EQUALS(AWS_TEXT_UTF16, detected);
+
+    ASSERT_SUCCESS(read_file_contents(&contents, allocator, "./utf16be.txt"));
+    detected = aws_text_detect_encoding(contents.buffer, contents.len);
+    aws_byte_buf_clean_up(&contents);
+    ASSERT_INT_EQUALS(AWS_TEXT_UTF16, detected);
+
+    return 0;
+}
+
+AWS_TEST_CASE(text_encoding_utf16, s_text_encoding_utf16)
+
+/* Verifies that a buffer holding every 7-bit value, and the ASCII resource file, are
+ * both detected as AWS_TEXT_ASCII. */
+static int s_text_encoding_ascii(struct aws_allocator *allocator, void *ctx) {
+    (void)ctx;
+
+    /* Holds each value 0..127 exactly once (1, 2, ..., 127, 0). A size_t index is used
+     * because a `char` counter cannot portably count up to 128: where char is signed,
+     * incrementing past 127 relies on implementation-defined wrap-around. */
+    uint8_t all_ascii_chars[128];
+    for (size_t i = 0; i < AWS_ARRAY_SIZE(all_ascii_chars); ++i) {
+        all_ascii_chars[i] = (uint8_t)((i + 1) % 128);
+    }
+
+    ASSERT_INT_EQUALS(AWS_TEXT_ASCII, aws_text_detect_encoding(all_ascii_chars, AWS_ARRAY_SIZE(all_ascii_chars)));
+
+    struct aws_byte_buf contents;
+    ASSERT_SUCCESS(read_file_contents(&contents, allocator, "./ascii.txt"));
+
+    /* Release the buffer before asserting so that a failed assertion (which presumably
+     * returns early from the test) does not leak it. */
+    enum aws_text_encoding detected = aws_text_detect_encoding(contents.buffer, contents.len);
+    aws_byte_buf_clean_up(&contents);
+    ASSERT_INT_EQUALS(AWS_TEXT_ASCII, detected);
+
+    return 0;
+}
+
+AWS_TEST_CASE(text_encoding_ascii, s_text_encoding_ascii)
+
+/* Verifies that aws_text_is_utf8() accepts the UTF-8 and ASCII resource files and
+ * rejects both UTF-16 resource files. */
+static int s_text_encoding_is_utf8(struct aws_allocator *allocator, void *ctx) {
+    (void)ctx;
+
+    /* In each case the buffer is released before asserting so that a failed assertion
+     * (which presumably returns early from the test) does not leak it. */
+    {
+        struct aws_byte_buf contents;
+        ASSERT_SUCCESS(read_file_contents(&contents, allocator, "./utf8.txt"));
+        bool is_utf8 = aws_text_is_utf8(contents.buffer, contents.len);
+        aws_byte_buf_clean_up(&contents);
+        ASSERT_TRUE(is_utf8);
+    }
+    {
+        struct aws_byte_buf contents;
+        ASSERT_SUCCESS(read_file_contents(&contents, allocator, "./ascii.txt"));
+        bool is_utf8 = aws_text_is_utf8(contents.buffer, contents.len);
+        aws_byte_buf_clean_up(&contents);
+        ASSERT_TRUE(is_utf8);
+    }
+    {
+        struct aws_byte_buf contents;
+        ASSERT_SUCCESS(read_file_contents(&contents, allocator, "./utf16be.txt"));
+        bool is_utf8 = aws_text_is_utf8(contents.buffer, contents.len);
+        aws_byte_buf_clean_up(&contents);
+        ASSERT_FALSE(is_utf8);
+    }
+    {
+        struct aws_byte_buf contents;
+        ASSERT_SUCCESS(read_file_contents(&contents, allocator, "./utf16le.txt"));
+        bool is_utf8 = aws_text_is_utf8(contents.buffer, contents.len);
+        aws_byte_buf_clean_up(&contents);
+        ASSERT_FALSE(is_utf8);
+    }
+
+    return 0;
+}
+
+AWS_TEST_CASE(text_encoding_is_utf8, s_text_encoding_is_utf8)
diff --git a/tests/resources/ascii.txt b/tests/resources/ascii.txt
@@ -0,0 +1 @@
+This is some text encoded as ASCII.
diff --git a/tests/resources/utf16be.txt b/tests/resources/utf16be.txt
diff --git a/tests/resources/utf16le.txt b/tests/resources/utf16le.txt
diff --git a/tests/resources/utf8.txt b/tests/resources/utf8.txt
@@ -0,0 +1 @@
+This is some text encoded in UTF8.