From 1043602e3792c8a11c68e76abd12b50f5c73baf4 Mon Sep 17 00:00:00 2001
From: grantbuster
Date: Wed, 10 Jul 2024 15:11:20 -0600
Subject: [PATCH] removed iter from parse pdf

---
 examples/ordinance_gpt/parse_pdf.py | 79 ++++++++++++++---------
 1 file changed, 39 insertions(+), 40 deletions(-)

diff --git a/examples/ordinance_gpt/parse_pdf.py b/examples/ordinance_gpt/parse_pdf.py
index d6a2945..83600de 100644
--- a/examples/ordinance_gpt/parse_pdf.py
+++ b/examples/ordinance_gpt/parse_pdf.py
@@ -24,51 +24,50 @@
     # download this from https://app.box.com/s/a8oi8jotb9vnu55rzdul7e291jnn7hmq
     fp_pdf = 'Palo Alto Iowa.pdf'

-    for idx in range(10):
-        fp_txt_all = fp_pdf.replace('.pdf', f'_all_{idx}.txt')
-        fp_txt_clean = fp_pdf.replace('.pdf', f'_clean_{idx}.txt')
-        fp_ords = fp_pdf.replace('.pdf', f'_ords_{idx}.csv')
+    fp_txt_all = fp_pdf.replace('.pdf', '_all.txt')
+    fp_txt_clean = fp_pdf.replace('.pdf', '_clean.txt')
+    fp_ords = fp_pdf.replace('.pdf', '_ords.csv')

-        doc = PDFDocument.from_file(fp_pdf)
+    doc = PDFDocument.from_file(fp_pdf)

-        text_splitter = RecursiveCharacterTextSplitter(
-            RTS_SEPARATORS,
-            chunk_size=3000,
-            chunk_overlap=300,
-            length_function=partial(ApiBase.count_tokens, model='gpt-4'),
-        )
+    text_splitter = RecursiveCharacterTextSplitter(
+        RTS_SEPARATORS,
+        chunk_size=3000,
+        chunk_overlap=300,
+        length_function=partial(ApiBase.count_tokens, model='gpt-4'),
+    )

-        # setup LLM and Ordinance service/utility classes
-        azure_api_key, azure_version, azure_endpoint = validate_api_params()
-        client = openai.AsyncAzureOpenAI(api_key=azure_api_key,
-                                         api_version=azure_version,
-                                         azure_endpoint=azure_endpoint)
-        llm_service = OpenAIService(client, rate_limit=1e9)
-        services = [llm_service]
-        kwargs = dict(llm_service=llm_service, model='gpt-4', temperature=0)
-        extractor = OrdinanceExtractor(LLMCaller(**kwargs))
+    # set up LLM and Ordinance service/utility classes
+    azure_api_key, azure_version, azure_endpoint = validate_api_params()
+    client = openai.AsyncAzureOpenAI(api_key=azure_api_key,
+                                     api_version=azure_version,
+                                     azure_endpoint=azure_endpoint)
+    llm_service = OpenAIService(client, rate_limit=1e9)
+    services = [llm_service]
+    kwargs = dict(llm_service=llm_service, model='gpt-4', temperature=0)
+    extractor = OrdinanceExtractor(LLMCaller(**kwargs))

-        """The following three function calls present three (equivalent) ways
-        to call ELM async ordinance functions. The three functions 1) check
-        ordinance documents for relevant ordinance info, 2) extract the
-        relevant text, and 3) run the decision tree to get structured ordinance
-        data from the unstructured legal text."""
+    """The following three function calls present three (equivalent) ways to
+    call ELM async ordinance functions. The three functions 1) check ordinance
+    documents for relevant ordinance info, 2) extract the relevant text, and 3)
+    run the decision tree to get structured ordinance data from the
+    unstructured legal text."""

-        # 1) call async func using a partial function (`run_async`)
-        run_async = partial(ARun.run, services)
-        doc = run_async(check_for_ordinance_info(doc, text_splitter, **kwargs))
+    # 1) call async func using a partial function (`run_async`)
+    run_async = partial(ARun.run, services)
+    doc = run_async(check_for_ordinance_info(doc, text_splitter, **kwargs))

-        # 2) Build coroutine first the use it to call async func
-        # (extract_ordinance_text_with_llm is an async function)
-        extrct = extract_ordinance_text_with_llm(doc, text_splitter, extractor)
-        doc = ARun.run(services, extrct)
+    # 2) Build coroutine first, then use it to call async func
+    # (extract_ordinance_text_with_llm is an async function)
+    extrct = extract_ordinance_text_with_llm(doc, text_splitter, extractor)
+    doc = ARun.run(services, extrct)

-        # 3) Build coroutine and use it to call async func in one go
-        doc = ARun.run(services, extract_ordinance_values(doc, **kwargs))
+    # 3) Build coroutine and use it to call async func in one go
+    doc = ARun.run(services, extract_ordinance_values(doc, **kwargs))

-        # save outputs
-        doc.metadata['ordinance_values'].to_csv(fp_ords)
-        with open(fp_txt_all, 'w') as f:
-            f.write(doc.metadata["ordinance_text"])
-        with open(fp_txt_clean, 'w') as f:
-            f.write(doc.metadata["cleaned_ordinance_text"])
+    # save outputs
+    doc.metadata['ordinance_values'].to_csv(fp_ords)
+    with open(fp_txt_all, 'w') as f:
+        f.write(doc.metadata["ordinance_text"])
+    with open(fp_txt_clean, 'w') as f:
+        f.write(doc.metadata["cleaned_ordinance_text"])
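
Note for readers: the three (equivalent) invocation patterns in the docstring
above are a general asyncio idiom rather than anything ELM-specific. Below is
a minimal, self-contained sketch of the same three styles using plain
asyncio. Here `fake_ordinance_call`, `run`, and the empty `services` list are
hypothetical stand-ins for an ELM async ordinance function, `ARun.run`, and
the `[llm_service]` list from the patch; this is an illustration under those
assumptions, not ELM's actual API.

    import asyncio
    from functools import partial

    async def fake_ordinance_call(text, model='gpt-4'):
        """Hypothetical stand-in for an async ELM ordinance function."""
        await asyncio.sleep(0)  # yield control, as a real LLM call would
        return f'processed {len(text)} chars with {model}'

    def run(services, coroutine):
        # stand-in for ARun.run: execute one coroutine on a fresh event loop
        return asyncio.run(coroutine)

    services = []  # placeholder for the service list

    # 1) bind the runner's leading argument with a partial, then call it
    run_async = partial(run, services)
    out = run_async(fake_ordinance_call('some ordinance text'))

    # 2) build the coroutine first, then hand it to the runner
    coro = fake_ordinance_call('some ordinance text')
    out = run(services, coro)

    # 3) build the coroutine and run it in one expression
    out = run(services, fake_ordinance_call('some ordinance text'))

All three are equivalent because a coroutine object does nothing until the
runner awaits it; which style to use is purely a readability choice.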