Skip to content

Commit

Permalink
Drop extra __index_level_0__ column from delta
Browse files Browse the repository at this point in the history
  • Loading branch information
jrbourbeau committed Feb 15, 2024
1 parent 7cc950b commit b156012
Showing 1 changed file with 3 additions and 1 deletion.
4 changes: 3 additions & 1 deletion pipeline/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import dask
import deltalake
import pandas as pd
import pyarrow as pa
from prefect import flow, task
from prefect.tasks import exponential_backoff

Expand Down Expand Up @@ -36,8 +37,9 @@ def json_file_to_parquet(file):
df = pd.read_json(file, lines=True)
outfile = STAGING_PARQUET_DIR / file.parent.name
fs.makedirs(outfile.parent, exist_ok=True)
data = pa.Table.from_pandas(df, preserve_index=False)
deltalake.write_deltalake(
outfile, df, mode="append", storage_options=storage_options
outfile, data, mode="append", storage_options=storage_options
)
print(f"Saved {outfile}")
return file
Expand Down

0 comments on commit b156012

Please sign in to comment.