Commit
removed iter from parse pdf
grantbuster committed Jul 10, 2024
1 parent 5437720 commit 1043602
Showing 1 changed file with 39 additions and 40 deletions.
79 changes: 39 additions & 40 deletions examples/ordinance_gpt/parse_pdf.py
@@ -24,51 +24,50 @@
# download this from https://app.box.com/s/a8oi8jotb9vnu55rzdul7e291jnn7hmq
fp_pdf = 'Palo Alto Iowa.pdf'

-for idx in range(10):
-    fp_txt_all = fp_pdf.replace('.pdf', f'_all_{idx}.txt')
-    fp_txt_clean = fp_pdf.replace('.pdf', f'_clean_{idx}.txt')
-    fp_ords = fp_pdf.replace('.pdf', f'_ords_{idx}.csv')
+fp_txt_all = fp_pdf.replace('.pdf', '_all.txt')
+fp_txt_clean = fp_pdf.replace('.pdf', '_clean.txt')
+fp_ords = fp_pdf.replace('.pdf', '_ords.csv')

-    doc = PDFDocument.from_file(fp_pdf)
+doc = PDFDocument.from_file(fp_pdf)

-    text_splitter = RecursiveCharacterTextSplitter(
-        RTS_SEPARATORS,
-        chunk_size=3000,
-        chunk_overlap=300,
-        length_function=partial(ApiBase.count_tokens, model='gpt-4'),
-    )
+text_splitter = RecursiveCharacterTextSplitter(
+    RTS_SEPARATORS,
+    chunk_size=3000,
+    chunk_overlap=300,
+    length_function=partial(ApiBase.count_tokens, model='gpt-4'),
+)

-    # setup LLM and Ordinance service/utility classes
-    azure_api_key, azure_version, azure_endpoint = validate_api_params()
-    client = openai.AsyncAzureOpenAI(api_key=azure_api_key,
-                                     api_version=azure_version,
-                                     azure_endpoint=azure_endpoint)
-    llm_service = OpenAIService(client, rate_limit=1e9)
-    services = [llm_service]
-    kwargs = dict(llm_service=llm_service, model='gpt-4', temperature=0)
-    extractor = OrdinanceExtractor(LLMCaller(**kwargs))
+# setup LLM and Ordinance service/utility classes
+azure_api_key, azure_version, azure_endpoint = validate_api_params()
+client = openai.AsyncAzureOpenAI(api_key=azure_api_key,
+                                 api_version=azure_version,
+                                 azure_endpoint=azure_endpoint)
+llm_service = OpenAIService(client, rate_limit=1e9)
+services = [llm_service]
+kwargs = dict(llm_service=llm_service, model='gpt-4', temperature=0)
+extractor = OrdinanceExtractor(LLMCaller(**kwargs))

"""The following three function calls present three (equivalent) ways
to call ELM async ordinance functions. The three functions 1) check
ordinance documents for relevant ordinance info, 2) extract the
relevant text, and 3) run the decision tree to get structured ordinance
data from the unstructured legal text."""
"""The following three function calls present three (equivalent) ways to
call ELM async ordinance functions. The three functions 1) check ordinance
documents for relevant ordinance info, 2) extract the relevant text, and 3)
run the decision tree to get structured ordinance data from the
unstructured legal text."""

-    # 1) call async func using a partial function (`run_async`)
-    run_async = partial(ARun.run, services)
-    doc = run_async(check_for_ordinance_info(doc, text_splitter, **kwargs))
+# 1) call async func using a partial function (`run_async`)
+run_async = partial(ARun.run, services)
+doc = run_async(check_for_ordinance_info(doc, text_splitter, **kwargs))

-    # 2) Build coroutine first, then use it to call async func
-    # (extract_ordinance_text_with_llm is an async function)
-    extrct = extract_ordinance_text_with_llm(doc, text_splitter, extractor)
-    doc = ARun.run(services, extrct)
+# 2) Build coroutine first, then use it to call async func
+# (extract_ordinance_text_with_llm is an async function)
+extrct = extract_ordinance_text_with_llm(doc, text_splitter, extractor)
+doc = ARun.run(services, extrct)

-    # 3) Build coroutine and use it to call async func in one go
-    doc = ARun.run(services, extract_ordinance_values(doc, **kwargs))
+# 3) Build coroutine and use it to call async func in one go
+doc = ARun.run(services, extract_ordinance_values(doc, **kwargs))

-    # save outputs
-    doc.metadata['ordinance_values'].to_csv(fp_ords)
-    with open(fp_txt_all, 'w') as f:
-        f.write(doc.metadata["ordinance_text"])
-    with open(fp_txt_clean, 'w') as f:
-        f.write(doc.metadata["cleaned_ordinance_text"])
+# save outputs
+doc.metadata['ordinance_values'].to_csv(fp_ords)
+with open(fp_txt_all, 'w') as f:
+    f.write(doc.metadata["ordinance_text"])
+with open(fp_txt_clean, 'w') as f:
+    f.write(doc.metadata["cleaned_ordinance_text"])
