Skip to content

Commit

Permalink
fix: correctly convert nullability of nested parquet fields to arrow
Browse files Browse the repository at this point in the history
The `parquet_to_arrow_schema` function tries to set the `nullable` flag of
`Field` according to the parquet repetition levels. That flag is then used
via the `InitNested` `enum` to calculate the level at which data is valid.

In the following parquet message type, neither the list nor its elements
should be marked as `nullable`, the definition level for the `REPEATED`
group only indicates an empty list but not any nulls.

```
message eventlog {
  REQUIRED group events (LIST) {
    REPEATED group array {
      REQUIRED BYTE_ARRAY event_name (STRING);
      REQUIRED INT64 event_time (TIMESTAMP(MILLIS,true));
    }
  }
}
```

The nullability of fields was asserted in multiple tests, but those
assertions did not match the comments.

This is a port of jorgecarleitao/arrow2#1565 to the nano-arrow crate.
  • Loading branch information
jhorstmann committed Oct 9, 2023
1 parent 05d9eb2 commit 980dcf3
Showing 1 changed file with 57 additions and 15 deletions.
72 changes: 57 additions & 15 deletions crates/nano-arrow/src/io/parquet/read/schema/convert.rs
Original file line number Diff line number Diff line change
Expand Up @@ -343,12 +343,12 @@ fn to_list(
let field = fields.first().unwrap();
(
&field.get_field_info().name,
field.get_field_info().repetition != Repetition::Required,
field.get_field_info().repetition == Repetition::Optional,
)
},
_ => (
&item.get_field_info().name,
item.get_field_info().repetition != Repetition::Required,
item.get_field_info().repetition == Repetition::Optional,
),
};

Expand Down Expand Up @@ -596,7 +596,7 @@ mod tests {
{
arrow_fields.push(Field::new(
"my_list",
DataType::List(Box::new(Field::new("element", DataType::Utf8, true))),
DataType::List(Box::new(Field::new("element", DataType::Utf8, false))),
true,
));
}
Expand All @@ -608,7 +608,7 @@ mod tests {
{
arrow_fields.push(Field::new(
"my_list",
DataType::List(Box::new(Field::new("element", DataType::Int32, true))),
DataType::List(Box::new(Field::new("element", DataType::Int32, false))),
true,
));
}
Expand All @@ -627,7 +627,7 @@ mod tests {
]);
arrow_fields.push(Field::new(
"my_list",
DataType::List(Box::new(Field::new("element", arrow_struct, true))),
DataType::List(Box::new(Field::new("element", arrow_struct, false))),
true,
));
}
Expand All @@ -643,7 +643,7 @@ mod tests {
let arrow_struct = DataType::Struct(vec![Field::new("str", DataType::Utf8, false)]);
arrow_fields.push(Field::new(
"my_list",
DataType::List(Box::new(Field::new("array", arrow_struct, true))),
DataType::List(Box::new(Field::new("array", arrow_struct, false))),
true,
));
}
Expand All @@ -659,7 +659,7 @@ mod tests {
let arrow_struct = DataType::Struct(vec![Field::new("str", DataType::Utf8, false)]);
arrow_fields.push(Field::new(
"my_list",
DataType::List(Box::new(Field::new("my_list_tuple", arrow_struct, true))),
DataType::List(Box::new(Field::new("my_list_tuple", arrow_struct, false))),
true,
));
}
Expand All @@ -669,8 +669,50 @@ mod tests {
{
arrow_fields.push(Field::new(
"name",
DataType::List(Box::new(Field::new("name", DataType::Int32, true))),
true,
DataType::List(Box::new(Field::new("name", DataType::Int32, false))),
false,
));
}

let parquet_schema = SchemaDescriptor::try_from_message(message_type)?;
let fields = parquet_to_arrow_schema(parquet_schema.fields());

assert_eq!(arrow_fields, fields);
Ok(())
}

#[test]
fn test_parquet_list_with_struct() -> Result<()> {
let mut arrow_fields = Vec::new();

let message_type = "
message eventlog {
REQUIRED group events (LIST) {
REPEATED group array {
REQUIRED BYTE_ARRAY event_name (STRING);
REQUIRED INT64 event_time (TIMESTAMP(MILLIS,true));
}
}
}
";

{
let struct_fields = vec![
Field::new("event_name", DataType::Utf8, false),
Field::new(
"event_time",
DataType::Timestamp(TimeUnit::Millisecond, Some("+00:00".into())),
false,
),
];
arrow_fields.push(Field::new(
"events",
DataType::List(Box::new(Field::new(
"array",
DataType::Struct(struct_fields),
false,
))),
false,
));
}

Expand Down Expand Up @@ -797,9 +839,9 @@ mod tests {
DataType::List(Box::new(Field::new(
"innerGroup",
DataType::Struct(vec![Field::new("leaf3", DataType::Int32, true)]),
true,
false,
))),
true,
false,
);

let outer_group_list = Field::new(
Expand All @@ -810,9 +852,9 @@ mod tests {
Field::new("leaf2", DataType::Int32, true),
inner_group_list,
]),
true,
false,
))),
true,
false,
);
arrow_fields.push(outer_group_list);
}
Expand Down Expand Up @@ -873,8 +915,8 @@ mod tests {
Field::new("string", DataType::Utf8, true),
Field::new(
"bools",
DataType::List(Box::new(Field::new("bools", DataType::Boolean, true))),
true,
DataType::List(Box::new(Field::new("bools", DataType::Boolean, false))),
false,
),
Field::new("date", DataType::Date32, true),
Field::new("time_milli", DataType::Time32(TimeUnit::Millisecond), true),
Expand Down

0 comments on commit 980dcf3

Please sign in to comment.