From fa4bea871086db70a8d19820a2f266de826836e1 Mon Sep 17 00:00:00 2001 From: Data Psycho Date: Tue, 29 Nov 2022 17:44:22 +0100 Subject: [PATCH] Adding more dataframe example to read csv files (#4360) * Adding more dataframe example to read csv files * Update typo in the example * Formatted changes with rustfmt * Update datafusion-examples/examples/dataframe.rs Co-authored-by: Martin Grigorov * Update datafusion-examples/examples/dataframe.rs Co-authored-by: Martin Grigorov * Completed the incomplete instruction, formatted the content as suggested * Update datafusion-examples/examples/dataframe.rs Co-authored-by: Andrew Lamb * Fix the type in default import for Csv Reader Option schema * csv file creation into a separate function and then call it into another function * Resolving clippy error for unnecessary let statement * Resolving clippy unused variable error Co-authored-by: ALAMSHC Co-authored-by: Martin Grigorov Co-authored-by: Andrew Lamb --- datafusion-examples/examples/dataframe.rs | 57 +++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/datafusion-examples/examples/dataframe.rs b/datafusion-examples/examples/dataframe.rs index 5cdec9b88716..a212387e2162 100644 --- a/datafusion-examples/examples/dataframe.rs +++ b/datafusion-examples/examples/dataframe.rs @@ -15,8 +15,11 @@ // specific language governing permissions and limitations // under the License. 
+use datafusion::arrow::datatypes::{DataType, Field, Schema};
 use datafusion::error::Result;
 use datafusion::prelude::*;
+use std::fs;
+use std::sync::Arc;
 
 /// This example demonstrates executing a simple query against an Arrow data source (Parquet) and
 /// fetching results, using the DataFrame trait
@@ -39,5 +42,59 @@ async fn main() -> Result<()> {
     // print the results
     df.show().await?;
 
+    // Read a CSV file, letting the reader infer the schema, and print it
+    let csv_df = example_read_csv_file_with_inferred_schema().await;
+    csv_df.show().await?;
+
+    // Read the same CSV file with an explicitly defined schema and print it
+    let csv_df = example_read_csv_file_with_schema().await;
+    csv_df.show().await?;
+
     Ok(())
 }
+
+/// Writes a small test CSV file (a header row plus two data rows) to `path`; panics on failure.
+fn create_csv_file(path: String) {
+    // Raw string so the quoted, comma-containing `time` values need no escaping
+    let content = r#"id,time,vote,unixtime,rating
+a1,"10 6, 2013",3,1381017600,5.0
+a2,"08 9, 2013",2,1376006400,4.5"#;
+    // Write the data, replacing any existing file at `path`
+    fs::write(path, content).expect("Problem with writing file!");
+}
+
+/// Example: read `example.csv` into a DataFrame with an inferred schema (default `CsvReadOptions`).
+async fn example_read_csv_file_with_inferred_schema() -> Arc<DataFrame> {
+    let path = "example.csv";
+    // Create the csv file to read, using the helper above
+    create_csv_file(path.to_string());
+    // Create a session context
+    let ctx = SessionContext::new();
+    // Build a DataFrame over the CSV file; `unwrap` panics if the read fails
+    ctx.read_csv(path, CsvReadOptions::default()).await.unwrap()
+}
+
+/// Example: read `example.csv` into a DataFrame using an explicitly defined schema.
+async fn example_read_csv_file_with_schema() -> Arc<DataFrame> {
+    let path = "example.csv";
+    // Create the csv file to read, using the helper above
+    create_csv_file(path.to_string());
+    // Create a session context
+    let ctx = SessionContext::new();
+    // Define the schema: one field per CSV column, in header order
+    let schema = Schema::new(vec![
+        Field::new("id", DataType::Utf8, false),
+        Field::new("time", DataType::Utf8, false),
+        Field::new("vote", DataType::Int32, true),
+        Field::new("unixtime", DataType::Int64, false),
+        Field::new("rating", DataType::Float32, true),
+    ]);
+    // Create the csv read options carrying the desired schema
+    let csv_read_option = CsvReadOptions {
+        // Borrow the schema defined above; every other option keeps its default
+        schema: Some(&schema),
+        ..Default::default()
+    };
+    // Build a DataFrame over the CSV file with these options; `unwrap` panics if the read fails
+    ctx.read_csv(path, csv_read_option).await.unwrap()
+}