Commit
removed iter from parse pdf
grantbuster committed Jul 10, 2024
1 parent 5437720 commit 1043602
Showing 1 changed file with 39 additions and 40 deletions.
79 changes: 39 additions & 40 deletions examples/ordinance_gpt/parse_pdf.py
@@ -24,51 +24,50 @@
# download this from https://app.box.com/s/a8oi8jotb9vnu55rzdul7e291jnn7hmq
fp_pdf = 'Palo Alto Iowa.pdf'

-for idx in range(10):
-    fp_txt_all = fp_pdf.replace('.pdf', f'_all_{idx}.txt')
-    fp_txt_clean = fp_pdf.replace('.pdf', f'_clean_{idx}.txt')
-    fp_ords = fp_pdf.replace('.pdf', f'_ords_{idx}.csv')
+fp_txt_all = fp_pdf.replace('.pdf', '_all.txt')
+fp_txt_clean = fp_pdf.replace('.pdf', '_clean.txt')
+fp_ords = fp_pdf.replace('.pdf', '_ords.csv')

-    doc = PDFDocument.from_file(fp_pdf)
+doc = PDFDocument.from_file(fp_pdf)

-    text_splitter = RecursiveCharacterTextSplitter(
-        RTS_SEPARATORS,
-        chunk_size=3000,
-        chunk_overlap=300,
-        length_function=partial(ApiBase.count_tokens, model='gpt-4'),
-    )
+text_splitter = RecursiveCharacterTextSplitter(
+    RTS_SEPARATORS,
+    chunk_size=3000,
+    chunk_overlap=300,
+    length_function=partial(ApiBase.count_tokens, model='gpt-4'),
+)

-    # setup LLM and Ordinance service/utility classes
-    azure_api_key, azure_version, azure_endpoint = validate_api_params()
-    client = openai.AsyncAzureOpenAI(api_key=azure_api_key,
-                                     api_version=azure_version,
-                                     azure_endpoint=azure_endpoint)
-    llm_service = OpenAIService(client, rate_limit=1e9)
-    services = [llm_service]
-    kwargs = dict(llm_service=llm_service, model='gpt-4', temperature=0)
-    extractor = OrdinanceExtractor(LLMCaller(**kwargs))
+# setup LLM and Ordinance service/utility classes
+azure_api_key, azure_version, azure_endpoint = validate_api_params()
+client = openai.AsyncAzureOpenAI(api_key=azure_api_key,
+                                 api_version=azure_version,
+                                 azure_endpoint=azure_endpoint)
+llm_service = OpenAIService(client, rate_limit=1e9)
+services = [llm_service]
+kwargs = dict(llm_service=llm_service, model='gpt-4', temperature=0)
+extractor = OrdinanceExtractor(LLMCaller(**kwargs))

"""The following three function calls present three (equivalent) ways
to call ELM async ordinance functions. The three functions 1) check
ordinance documents for relevant ordinance info, 2) extract the
relevant text, and 3) run the decision tree to get structured ordinance
data from the unstructured legal text."""
"""The following three function calls present three (equivalent) ways to
call ELM async ordinance functions. The three functions 1) check ordinance
documents for relevant ordinance info, 2) extract the relevant text, and 3)
run the decision tree to get structured ordinance data from the
unstructured legal text."""

-    # 1) call async func using a partial function (`run_async`)
-    run_async = partial(ARun.run, services)
-    doc = run_async(check_for_ordinance_info(doc, text_splitter, **kwargs))
+# 1) call async func using a partial function (`run_async`)
+run_async = partial(ARun.run, services)
+doc = run_async(check_for_ordinance_info(doc, text_splitter, **kwargs))

-    # 2) Build coroutine first, then use it to call async func
-    # (extract_ordinance_text_with_llm is an async function)
-    extrct = extract_ordinance_text_with_llm(doc, text_splitter, extractor)
-    doc = ARun.run(services, extrct)
+# 2) Build coroutine first, then use it to call async func
+# (extract_ordinance_text_with_llm is an async function)
+extrct = extract_ordinance_text_with_llm(doc, text_splitter, extractor)
+doc = ARun.run(services, extrct)

-    # 3) Build coroutine and use it to call async func in one go
-    doc = ARun.run(services, extract_ordinance_values(doc, **kwargs))
+# 3) Build coroutine and use it to call async func in one go
+doc = ARun.run(services, extract_ordinance_values(doc, **kwargs))

-    # save outputs
-    doc.metadata['ordinance_values'].to_csv(fp_ords)
-    with open(fp_txt_all, 'w') as f:
-        f.write(doc.metadata["ordinance_text"])
-    with open(fp_txt_clean, 'w') as f:
-        f.write(doc.metadata["cleaned_ordinance_text"])
+# save outputs
+doc.metadata['ordinance_values'].to_csv(fp_ords)
+with open(fp_txt_all, 'w') as f:
+    f.write(doc.metadata["ordinance_text"])
+with open(fp_txt_clean, 'w') as f:
+    f.write(doc.metadata["cleaned_ordinance_text"])
