Skip to content

Commit

Permalink
Added BOM detection/encoding detection via aws_text_encoding_detect (#…
Browse files Browse the repository at this point in the history
…647)

* Added BOM detection/encoding detection via aws_text_encoding_detect

* Added aws_text_is_utf8 and tests, as well as ascii resource test
  • Loading branch information
Justin Boswell authored Jun 12, 2020
1 parent 412d0cb commit 9765afb
Show file tree
Hide file tree
Showing 8 changed files with 184 additions and 0 deletions.
18 changes: 18 additions & 0 deletions include/aws/common/encoding.h
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,24 @@ AWS_STATIC_IMPL void aws_write_u16(uint16_t value, uint8_t *buffer);
*/
AWS_STATIC_IMPL uint16_t aws_read_u16(const uint8_t *buffer);

/* Text encodings that aws_text_detect_encoding() can report. */
enum aws_text_encoding {
    /* No BOM was recognized and at least one byte has its high bit set. */
    AWS_TEXT_UNKNOWN = 0,
    /* Buffer starts with the UTF-8 BOM. */
    AWS_TEXT_UTF8 = 1,
    /* Buffer starts with a UTF-16 BOM (either byte order). */
    AWS_TEXT_UTF16 = 2,
    /* Buffer starts with a UTF-32 BOM (either byte order). */
    AWS_TEXT_UTF32 = 3,
    /* No BOM was recognized and every byte is 7-bit. */
    AWS_TEXT_ASCII = 4,
};

/* Guesses the encoding of the buffer from its leading byte-order mark (BOM).
 * A UTF-8, UTF-16 (LE or BE) or UTF-32 (LE or BE) BOM yields the matching AWS_TEXT_* value.
 * If no BOM is recognized, AWS_TEXT_ASCII is returned when every byte has its high bit clear
 * (this includes an empty buffer); otherwise AWS_TEXT_UNKNOWN is returned.
 */
AWS_STATIC_IMPL enum aws_text_encoding aws_text_detect_encoding(const uint8_t *bytes, size_t size);

/*
 * Returns true if aws_text_detect_encoding() classifies the supplied bytes as UTF8 or ASCII,
 * i.e. they begin with a UTF-8 BOM or consist solely of 7-bit bytes. The byte sequences are not
 * validated: UTF-8 text that has no BOM and contains multi-byte characters yields false.
 */
AWS_STATIC_IMPL bool aws_text_is_utf8(const uint8_t *bytes, size_t size);

#ifndef AWS_NO_STATIC_IMPL
# include <aws/common/encoding.inl>
#endif /* AWS_NO_STATIC_IMPL */
Expand Down
38 changes: 38 additions & 0 deletions include/aws/common/encoding.inl
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,44 @@ AWS_STATIC_IMPL uint16_t aws_read_u16(const uint8_t *buffer) {
return aws_ntoh16(value);
}

/*
 * Guesses the text encoding of `bytes` (length `size`) from a leading byte-order mark.
 * When no BOM matches, the buffer is reported as AWS_TEXT_ASCII if every byte is 7-bit
 * (an empty buffer qualifies) and as AWS_TEXT_UNKNOWN otherwise.
 *
 * Reference: https://unicodebook.readthedocs.io/guess_encoding.html
 */
AWS_STATIC_IMPL enum aws_text_encoding aws_text_detect_encoding(const uint8_t *bytes, size_t size) {
    static const uint8_t s_utf8_bom[] = {0xEF, 0xBB, 0xBF};
    static const uint8_t s_utf32_le_bom[] = {0xFF, 0xFE, 0x00, 0x00};
    static const uint8_t s_utf32_be_bom[] = {0x00, 0x00, 0xFE, 0xFF};
    static const uint8_t s_utf16_le_bom[] = {0xFF, 0xFE};
    static const uint8_t s_utf16_be_bom[] = {0xFE, 0xFF};

    if (size >= sizeof(s_utf8_bom) && memcmp(bytes, s_utf8_bom, sizeof(s_utf8_bom)) == 0) {
        return AWS_TEXT_UTF8;
    }

    /* UTF-32 must be tested before UTF-16: the UTF-32 LE BOM starts with the UTF-16 LE BOM. */
    if (size >= sizeof(s_utf32_le_bom) && (memcmp(bytes, s_utf32_le_bom, sizeof(s_utf32_le_bom)) == 0 ||
                                           memcmp(bytes, s_utf32_be_bom, sizeof(s_utf32_be_bom)) == 0)) {
        return AWS_TEXT_UTF32;
    }

    if (size >= sizeof(s_utf16_le_bom) && (memcmp(bytes, s_utf16_le_bom, sizeof(s_utf16_le_bom)) == 0 ||
                                           memcmp(bytes, s_utf16_be_bom, sizeof(s_utf16_be_bom)) == 0)) {
        return AWS_TEXT_UTF16;
    }

    /* No BOM: any byte above 0x7F rules out plain ASCII. */
    for (size_t pos = 0; pos < size; ++pos) {
        if (bytes[pos] > 0x7F) {
            return AWS_TEXT_UNKNOWN;
        }
    }

    return AWS_TEXT_ASCII;
}

/*
 * Returns true when aws_text_detect_encoding() reports AWS_TEXT_UTF8 (UTF-8 BOM present)
 * or AWS_TEXT_ASCII (no BOM, 7-bit bytes only). No UTF-8 sequence validation is performed.
 */
AWS_STATIC_IMPL bool aws_text_is_utf8(const uint8_t *bytes, size_t size) {
    switch (aws_text_detect_encoding(bytes, size)) {
        case AWS_TEXT_UTF8:
        case AWS_TEXT_ASCII:
            return true;
        default:
            return false;
    }
}

AWS_EXTERN_C_END

#endif /* AWS_COMMON_ENCODING_INL */
9 changes: 9 additions & 0 deletions tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,10 @@ add_test_case(uint16_buffer_test)
add_test_case(uint16_buffer_non_aligned_test)
add_test_case(uint16_buffer_signed_positive_test)
add_test_case(uint16_buffer_signed_negative_test)
# Text encoding detection tests (defined in tests/encoding_test.c)
add_test_case(text_encoding_utf8)
add_test_case(text_encoding_utf16)
add_test_case(text_encoding_ascii)
add_test_case(text_encoding_is_utf8)

add_test_case(scheduler_cleanup_cancellation)
add_test_case(scheduler_ordering_test)
Expand Down Expand Up @@ -449,3 +453,8 @@ endif()

file(GLOB FUZZ_TESTS "fuzz/*.c")
aws_add_fuzz_tests("${FUZZ_TESTS}" "" "")

# Resources to use for testing.
# Copies the contents of tests/resources (ascii.txt, utf8.txt, utf16le.txt, utf16be.txt) into the
# test binary directory; the encoding tests open them via relative paths such as "./utf8.txt".
# NOTE(review): per the CMake documentation PRE_BUILD is only honored by Visual Studio generators;
# other generators treat it as PRE_LINK - confirm this is early enough for every supported build.
add_custom_command(TARGET ${PROJECT_NAME}-tests PRE_BUILD
COMMAND ${CMAKE_COMMAND} -E copy_directory
${CMAKE_CURRENT_SOURCE_DIR}/resources ${CMAKE_CURRENT_BINARY_DIR})
117 changes: 117 additions & 0 deletions tests/encoding_test.c
Original file line number Diff line number Diff line change
Expand Up @@ -997,3 +997,120 @@ static int s_hex_encoding_append_dynamic_test_case_empty(struct aws_allocator *a
}

AWS_TEST_CASE(hex_encoding_append_dynamic_test_case_empty, s_hex_encoding_append_dynamic_test_case_empty)

/*
 * Reads the whole file `filename` into `out_buf`.
 *
 * On success returns AWS_OP_SUCCESS; `out_buf` has been initialized from `alloc`, `out_buf->len`
 * is the number of bytes in the file, and one extra zero byte follows the payload so the buffer
 * can be handed to null-terminated APIs. The caller owns the buffer and must release it with
 * aws_byte_buf_clean_up().
 *
 * On failure returns AWS_OP_ERR and `out_buf` holds no allocation that the caller must free.
 */
static int read_file_contents(struct aws_byte_buf *out_buf, struct aws_allocator *alloc, const char *filename) {
    AWS_ZERO_STRUCT(*out_buf);

    /* Binary mode: the size reported by ftell() must match what fread() delivers, which text mode
     * does not guarantee on platforms that translate line endings (and the UTF-16 fixtures are
     * not text as far as the C runtime is concerned). */
    FILE *fp = fopen(filename, "rb");
    if (fp == NULL) {
        return AWS_OP_ERR;
    }

    int result = AWS_OP_ERR;
    long file_size = 0;

    if (fseek(fp, 0L, SEEK_END)) {
        goto done;
    }

    /* ftell() reports failure as -1; converting that to size_t would yield a bogus size. */
    file_size = ftell(fp);
    if (file_size < 0) {
        goto done;
    }

    if (fseek(fp, 0L, SEEK_SET)) {
        goto done;
    }

    /* Allocate one extra byte for the null terminator. */
    if (aws_byte_buf_init(out_buf, alloc, (size_t)file_size + 1)) {
        goto done;
    }

    /* Ensure compatibility with null-terminated APIs, but don't consider
     * the null terminator part of the length of the payload */
    out_buf->len = (size_t)file_size;
    out_buf->buffer[out_buf->len] = 0;

    if (fread(out_buf->buffer, 1, out_buf->len, fp) < out_buf->len) {
        /* Short read: release the buffer so the caller never sees partial contents. */
        aws_byte_buf_clean_up(out_buf);
        goto done;
    }

    result = AWS_OP_SUCCESS;

done:
    fclose(fp);
    return result;
}

static int s_text_encoding_utf8(struct aws_allocator *allocator, void *ctx) {
struct aws_byte_buf contents;
ASSERT_SUCCESS(read_file_contents(&contents, allocator, "./utf8.txt"));
ASSERT_INT_EQUALS(AWS_TEXT_UTF8, aws_text_detect_encoding(contents.buffer, contents.len));
aws_byte_buf_clean_up(&contents);
return 0;
}

AWS_TEST_CASE(text_encoding_utf8, s_text_encoding_utf8)

static int s_text_encoding_utf16(struct aws_allocator *allocator, void *ctx) {
struct aws_byte_buf contents;

ASSERT_SUCCESS(read_file_contents(&contents, allocator, "./utf16le.txt"));
ASSERT_INT_EQUALS(AWS_TEXT_UTF16, aws_text_detect_encoding(contents.buffer, contents.len));
aws_byte_buf_clean_up(&contents);

ASSERT_SUCCESS(read_file_contents(&contents, allocator, "./utf16be.txt"));
ASSERT_INT_EQUALS(AWS_TEXT_UTF16, aws_text_detect_encoding(contents.buffer, contents.len));
aws_byte_buf_clean_up(&contents);

return 0;
}

AWS_TEST_CASE(text_encoding_utf16, s_text_encoding_utf16)

static int s_text_encoding_ascii(struct aws_allocator *allocator, void *ctx) {
char all_ascii_chars[128];
for (char c = 0; c < AWS_ARRAY_SIZE(all_ascii_chars); ++c) {
all_ascii_chars[(int)c] = (c + 1) % 128;
}

ASSERT_INT_EQUALS(
AWS_TEXT_ASCII, aws_text_detect_encoding((const uint8_t *)all_ascii_chars, AWS_ARRAY_SIZE(all_ascii_chars)));

struct aws_byte_buf contents;
ASSERT_SUCCESS(read_file_contents(&contents, allocator, "./ascii.txt"));
ASSERT_INT_EQUALS(AWS_TEXT_ASCII, aws_text_detect_encoding(contents.buffer, contents.len));
aws_byte_buf_clean_up(&contents);

return 0;
}

AWS_TEST_CASE(text_encoding_ascii, s_text_encoding_ascii)

/*
 * Verifies aws_text_is_utf8(): true for the UTF-8 and ASCII resource files, false for both
 * UTF-16 resource files.
 */
static int s_text_encoding_is_utf8(struct aws_allocator *allocator, void *ctx) {
    (void)ctx;

    struct aws_byte_buf contents;
    bool is_utf8 = false;

    /* In each case the buffer is released before asserting so that a failed assertion (which
     * presumably returns from the test early) does not leak the file contents. */
    ASSERT_SUCCESS(read_file_contents(&contents, allocator, "./utf8.txt"));
    is_utf8 = aws_text_is_utf8(contents.buffer, contents.len);
    aws_byte_buf_clean_up(&contents);
    ASSERT_TRUE(is_utf8);

    ASSERT_SUCCESS(read_file_contents(&contents, allocator, "./ascii.txt"));
    is_utf8 = aws_text_is_utf8(contents.buffer, contents.len);
    aws_byte_buf_clean_up(&contents);
    ASSERT_TRUE(is_utf8);

    ASSERT_SUCCESS(read_file_contents(&contents, allocator, "./utf16be.txt"));
    is_utf8 = aws_text_is_utf8(contents.buffer, contents.len);
    aws_byte_buf_clean_up(&contents);
    ASSERT_FALSE(is_utf8);

    ASSERT_SUCCESS(read_file_contents(&contents, allocator, "./utf16le.txt"));
    is_utf8 = aws_text_is_utf8(contents.buffer, contents.len);
    aws_byte_buf_clean_up(&contents);
    ASSERT_FALSE(is_utf8);

    return 0;
}

AWS_TEST_CASE(text_encoding_is_utf8, s_text_encoding_is_utf8)
1 change: 1 addition & 0 deletions tests/resources/ascii.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text encoded as ASCII.
Binary file added tests/resources/utf16be.txt
Binary file not shown.
Binary file added tests/resources/utf16le.txt
Binary file not shown.
1 change: 1 addition & 0 deletions tests/resources/utf8.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text encoded in UTF8.

0 comments on commit 9765afb

Please sign in to comment.