Skip to content

Commit

Permalink
Fix empty map & list ingestion (#873)
Browse files Browse the repository at this point in the history
  • Loading branch information
sfc-gh-alhuang authored Oct 25, 2024
1 parent fba8ca7 commit b1f76d9
Show file tree
Hide file tree
Showing 6 changed files with 50 additions and 63 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -492,6 +492,11 @@ private static ParquetBufferValue get3LevelListValue(
listVal.add(Collections.singletonList(parsedValue.getValue()));
estimatedParquetSize += parsedValue.getSize();
}
if (listVal.isEmpty()) {
subColumnFinder
.getSubColumns(path)
.forEach(subColumn -> statsMap.get(subColumn).incCurrentNullCount());
}
return new ParquetBufferValue(listVal, estimatedParquetSize);
}

Expand Down Expand Up @@ -539,6 +544,11 @@ private static ParquetBufferValue get3LevelMapValue(
listVal.add(Arrays.asList(parsedKey.getValue(), parsedValue.getValue()));
estimatedParquetSize += parsedKey.getSize() + parsedValue.getSize();
}
if (listVal.isEmpty()) {
subColumnFinder
.getSubColumns(path)
.forEach(subColumn -> statsMap.get(subColumn).incCurrentNullCount());
}
return new ParquetBufferValue(listVal, estimatedParquetSize);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -286,6 +286,9 @@ Double getCurrentMaxRealValue() {

void incCurrentNullCount() {
this.currentNullCount += 1;
if (enableValuesCount) {
numberOfValues++;
}
}

long getCurrentNullCount() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -389,13 +389,14 @@ private void writeValues(List<?> values, GroupType type) {
}
} else {
/* Struct */
recordConsumer.startGroup();
if (val instanceof List) {
writeValues((List<?>) val, cols.get(i).asGroupType());
} else {
if (!(val instanceof List)) {
throw new ParquetEncodingException(
String.format("Field %s should be a 2 level struct", fieldName));
}
recordConsumer.startGroup();
if (!((List<?>) val).isEmpty()) {
writeValues((List<?>) val, cols.get(i).asGroupType());
}
recordConsumer.endGroup();
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,7 @@ public void testGetCombinedStats() throws Exception {
Assert.assertEquals(BigInteger.valueOf(1), result.getCurrentMinIntValue());
Assert.assertEquals(BigInteger.valueOf(8), result.getCurrentMaxIntValue());
Assert.assertEquals(enableNDVAndNV ? 7 : -1, result.getDistinctValues());
Assert.assertEquals(enableNDVAndNV ? 8 : -1, result.getNumberOfValues());
Assert.assertEquals(enableNDVAndNV ? 10 : -1, result.getNumberOfValues());

Assert.assertEquals(2, result.getCurrentNullCount());
Assert.assertNull(result.getCurrentMinStrValue());
Expand Down Expand Up @@ -239,7 +239,7 @@ public void testGetCombinedStats() throws Exception {
Assert.assertEquals(2, result.getCurrentNullCount());
Assert.assertEquals(5, result.getCurrentMaxLength());
Assert.assertEquals(enableNDVAndNV ? 7 : -1, result.getDistinctValues());
Assert.assertEquals(enableNDVAndNV ? 8 : -1, result.getNumberOfValues());
Assert.assertEquals(enableNDVAndNV ? 10 : -1, result.getNumberOfValues());

Assert.assertNull(result.getCurrentMinRealValue());
Assert.assertNull(result.getCurrentMaxRealValue());
Expand All @@ -264,7 +264,7 @@ public void testGetCombinedStatsNull() throws Exception {
Assert.assertEquals(BigInteger.valueOf(2), result.getCurrentMinIntValue());
Assert.assertEquals(BigInteger.valueOf(8), result.getCurrentMaxIntValue());
Assert.assertEquals(enableNDVAndNV ? 4 : -1, result.getDistinctValues());
Assert.assertEquals(enableNDVAndNV ? 4 : -1, result.getNumberOfValues());
Assert.assertEquals(enableNDVAndNV ? 6 : -1, result.getNumberOfValues());

Assert.assertEquals(2, result.getCurrentNullCount());

Expand Down Expand Up @@ -309,7 +309,7 @@ public void testGetCombinedStatsNull() throws Exception {
Assert.assertArrayEquals("g".getBytes(StandardCharsets.UTF_8), result.getCurrentMaxStrValue());

Assert.assertEquals(enableNDVAndNV ? 4 : -1, result.getDistinctValues());
Assert.assertEquals(enableNDVAndNV ? 4 : -1, result.getNumberOfValues());
Assert.assertEquals(enableNDVAndNV ? 5 : -1, result.getNumberOfValues());
Assert.assertEquals(1, result.getCurrentNullCount());

Assert.assertNull(result.getCurrentMinRealValue());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2097,23 +2097,23 @@ private void testStructuredStatsE2EHelper(AbstractRowBuffer<?> rowBuffer) {
.isEqualTo("key2".getBytes(StandardCharsets.UTF_8));
assertThat(columnEpStats.get("COLMAP.key_value.key").getCurrentNullCount()).isEqualTo(1);
assertThat(columnEpStats.get("COLMAP.key_value.key").getDistinctValues()).isEqualTo(2);
assertThat(columnEpStats.get("COLMAP.key_value.key").getNumberOfValues()).isEqualTo(2);
assertThat(columnEpStats.get("COLMAP.key_value.key").getNumberOfValues()).isEqualTo(3);

assertThat(columnEpStats.get("COLMAP.key_value.value").getCurrentMinIntValue())
.isEqualTo(BigInteger.ONE);
assertThat(columnEpStats.get("COLMAP.key_value.value").getCurrentMaxIntValue())
.isEqualTo(BigInteger.ONE);
assertThat(columnEpStats.get("COLMAP.key_value.value").getCurrentNullCount()).isEqualTo(1);
assertThat(columnEpStats.get("COLMAP.key_value.value").getDistinctValues()).isEqualTo(1);
assertThat(columnEpStats.get("COLMAP.key_value.value").getNumberOfValues()).isEqualTo(2);
assertThat(columnEpStats.get("COLMAP.key_value.value").getNumberOfValues()).isEqualTo(3);

assertThat(columnEpStats.get("COLARRAY.list.element").getCurrentMinIntValue())
.isEqualTo(BigInteger.ONE);
assertThat(columnEpStats.get("COLARRAY.list.element").getCurrentMaxIntValue())
.isEqualTo(BigInteger.ONE);
assertThat(columnEpStats.get("COLARRAY.list.element").getCurrentNullCount()).isEqualTo(1);
assertThat(columnEpStats.get("COLARRAY.list.element").getDistinctValues()).isEqualTo(1);
assertThat(columnEpStats.get("COLARRAY.list.element").getNumberOfValues()).isEqualTo(4);
assertThat(columnEpStats.get("COLARRAY.list.element").getNumberOfValues()).isEqualTo(5);
}

private static Thread getThreadThatWaitsForLockReleaseAndFlushes(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ public void testStructuredDataType() throws Exception {
"object(a int, b string, c boolean)", "{\"a\": 1, \"b\": \"test\", \"c\": true}");
assertStructuredDataType("map(string, int)", "{\"key1\": 1}");
assertStructuredDataType("array(int)", "[1, 2, 3]");
assertStructuredDataType("array(string) not null", "[]");
assertStructuredDataType("map(string, int) not null", "{}");
assertMap(
"map(string, int)",
new HashMap<String, Integer>() {
Expand Down Expand Up @@ -95,59 +97,30 @@ public void testStructuredDataType() throws Exception {
.isInstanceOf(SFException.class)
.extracting("vendorCode")
.isEqualTo(ErrorCode.INVALID_FORMAT_ROW.getMessageCode());
}

/* Nested data types. Should be fixed. Fixed in server side. */
Assertions.assertThatThrownBy(
() -> assertStructuredDataType("array(array(int))", "[[1, 2], [3, 4]]"))
.isInstanceOf(SFException.class);
Assertions.assertThatThrownBy(
() ->
assertStructuredDataType(
"array(map(string, int))", "[{\"key1\": 1}, {\"key2\": 2}]"))
.isInstanceOf(SFException.class);
Assertions.assertThatThrownBy(
() ->
assertStructuredDataType(
"array(object(a int, b string, c boolean))",
"[{\"a\": 1, \"b\": \"test\", \"c\": true}]"))
.isInstanceOf(SFException.class);
Assertions.assertThatThrownBy(
() ->
assertStructuredDataType(
"map(string, object(a int, b string, c boolean))",
"{\"key1\": {\"a\": 1, \"b\": \"test\", \"c\": true}}"))
.isInstanceOf(SFException.class);
Assertions.assertThatThrownBy(
() -> assertStructuredDataType("map(string, array(int))", "{\"key1\": [1, 2, 3]}"))
.isInstanceOf(SFException.class);
Assertions.assertThatThrownBy(
() ->
assertStructuredDataType(
"map(string, map(string, int))", "{\"key1\": {\"key2\": 2}}"))
.isInstanceOf(SFException.class);
Assertions.assertThatThrownBy(
() ->
assertStructuredDataType(
"map(string, array(array(int)))", "{\"key1\": [[1, 2], [3, 4]]}"))
.isInstanceOf(SFException.class);
Assertions.assertThatThrownBy(
() ->
assertStructuredDataType(
"map(string, array(map(string, int)))",
"{\"key1\": [{\"key2\": 2}, {\"key3\": 3}]}"))
.isInstanceOf(SFException.class);
Assertions.assertThatThrownBy(
() ->
assertStructuredDataType(
"map(string, array(object(a int, b string, c boolean)))",
"{\"key1\": [{\"a\": 1, \"b\": \"test\", \"c\": true}]}"))
.isInstanceOf(SFException.class);
Assertions.assertThatThrownBy(
() ->
assertStructuredDataType(
"object(a int, b array(int), c map(string, int))",
"{\"a\": 1, \"b\": [1, 2, 3], \"c\": {\"key1\": 1}}"))
.isInstanceOf(SFException.class);
@Test
public void testNestedDataType() throws Exception {
assertStructuredDataType("map(string, map(string, object(a int, b string)))", "{}");
assertStructuredDataType("array(map(string, int))", "[{}]");
assertStructuredDataType("array(array(int))", "[[1, 2], [3, 4]]");
assertStructuredDataType("array(map(string, int))", "[{\"key1\": 1}, {\"key2\": 2}]");
assertStructuredDataType(
"array(object(a int, b string, c boolean))", "[{\"a\": 1, \"b\": \"test\", \"c\": true}]");
assertStructuredDataType(
"map(string, object(a int, b string, c boolean))",
"{\"key1\": {\"a\": 1, \"b\": \"test\", \"c\": true}}");
assertStructuredDataType("map(string, array(int))", "{\"key1\": [1, 2, 3]}");
assertStructuredDataType("map(string, map(string, int))", "{\"key1\": {\"key2\": 2}}");
assertStructuredDataType("map(string, array(array(int)))", "{\"key1\": [[1, 2], [3, 4]]}");
assertStructuredDataType(
"map(string, array(map(string, int)))", "{\"key1\": [{\"key2\": 2}, {\"key3\": 3}]}");
assertStructuredDataType(
"map(string, array(object(a int, b string, c boolean)))",
"{\"key1\": [{\"a\": 1, \"b\": \"test\", \"c\": true}]}");
assertStructuredDataType(
"object(a int, b array(int), c map(string, int))",
"{\"a\": 1, \"b\": [1, 2, 3], \"c\": {\"key1\": 1}}");
}

private void assertStructuredDataType(String dataType, String value) throws Exception {
Expand Down

0 comments on commit b1f76d9

Please sign in to comment.