Skip to content

Commit

Permalink
Added example run script parse_pdf.py and started debugging
Browse files Browse the repository at this point in the history
  • Loading branch information
grantbuster committed Jul 3, 2024
1 parent 906eeba commit a3e3464
Show file tree
Hide file tree
Showing 7 changed files with 139 additions and 23 deletions.
6 changes: 6 additions & 0 deletions elm/ords/extraction/ordinance.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,8 @@ async def parse(self, min_chunks_to_process=3):
if not is_legal_text:
logger.debug("Text at ind %d is not legal text", ind)
continue
else:
logger.debug("Text at ind %d is legal text", ind)

contains_ord_info = await self.parse_from_ind(
ind, self.CONTAINS_ORD_PROMPT, key="contains_ord_info"
Expand All @@ -181,6 +183,8 @@ async def parse(self, min_chunks_to_process=3):
"Text at ind %d does not contain ordinance info", ind
)
continue
else:
logger.debug("Text at ind %d does contain ordinance info", ind)

is_utility_scale = await self.parse_from_ind(
ind, self.IS_UTILITY_SCALE_PROMPT, key="x"
Expand All @@ -190,6 +194,8 @@ async def parse(self, min_chunks_to_process=3):
"Text at ind %d is not for utility-scale WECS", ind
)
continue
else:
logger.debug("Text at ind %d is for utility-scale WECS", ind)

self._ordinance_chunks.append({"text": text, "ind": ind})
logger.debug("Added text at ind %d to ordinances", ind)
Expand Down
13 changes: 7 additions & 6 deletions elm/ords/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -269,7 +269,7 @@ async def _process_with_logs(
):
"""Process counties with logging enabled."""
counties = _load_counties_to_process(county_fp)
azure_api_key, azure_version, azure_endpoint = _validate_api_params(
azure_api_key, azure_version, azure_endpoint = validate_api_params(
azure_api_key, azure_version, azure_endpoint
)

Expand Down Expand Up @@ -318,7 +318,7 @@ async def _process_with_logs(
)
trackers.append(usage_tracker)
task = asyncio.create_task(
download_docs_for_county_with_logging(
process_county_with_logging(
log_listener,
log_dir,
location,
Expand Down Expand Up @@ -379,7 +379,8 @@ def _load_counties_to_process(county_fp):
return load_counties_from_fp(county_fp)


def _validate_api_params(azure_api_key, azure_version, azure_endpoint):
def validate_api_params(azure_api_key=None, azure_version=None,
azure_endpoint=None):
"""Validate OpenAI API parameters."""
azure_api_key = azure_api_key or os.environ.get("AZURE_OPENAI_API_KEY")
azure_version = azure_version or os.environ.get("AZURE_OPENAI_VERSION")
Expand All @@ -404,7 +405,7 @@ def _configure_file_loader_kwargs(file_loader_kwargs):
return file_loader_kwargs


async def download_docs_for_county_with_logging(
async def process_county_with_logging(
listener,
log_dir,
county,
Expand Down Expand Up @@ -461,7 +462,7 @@ async def download_docs_for_county_with_logging(
listener, log_dir, location=county.full_name, level=level
):
task = asyncio.create_task(
download_doc_for_county(
process_county(
county,
text_splitter,
num_urls=num_urls,
Expand All @@ -485,7 +486,7 @@ async def download_docs_for_county_with_logging(
return doc


async def download_doc_for_county(
async def process_county(
county,
text_splitter,
num_urls=5,
Expand Down
2 changes: 1 addition & 1 deletion elm/ords/validation/content.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,13 +125,13 @@ async def parse_from_ind(self, ind, prompt, key):
logger.debug("Mem at ind %d is %s", step, mem)
check = mem.get(key)
if check is None:
# logger.debug("text=%s", text)
content = await self.slc.call(
sys_msg=prompt.format(key=key),
content=text,
usage_sub_label="document_content_validation",
)
check = mem[key] = content.get(key, False)
logger.info(f'Successfully called GPT! Check: {check}, ind: {ind}, key: {key}')
if check:
return check
return False
Expand Down
26 changes: 16 additions & 10 deletions elm/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -336,15 +336,19 @@ def clean_headers(self, char_thresh=0.6, page_thresh=0.8, split_on='\n',
self.full = combine_pages(self.pages)
return self.full

def convert_to_txt(self, txt_fp, separator=' '):
"""Function to convert contents of pdf document to txt file.
def convert_to_txt(self, txt_fp=None, separator=' ',
clean_header_kwargs=None):
"""Function to convert contents of pdf document to txt file using
poppler.
Parameters
----------
txt_fp: str
Directory for output txt file.
txt_fp: str | optional
Optional Directory for output txt file.
separator : str
Heuristic split string to look for spaces between columns
clean_header_kwargs : dict | None
Optional kwargs to override clean_headers kwargs
Returns
-------
Expand All @@ -354,11 +358,13 @@ def convert_to_txt(self, txt_fp, separator=' '):
text = self.clean_poppler(layout=True)
if is_multi_col(text, separator=separator):
text = self.clean_poppler(layout=False)
text = self.clean_headers(char_thresh=0.6, page_thresh=0.8,
split_on='\n',
iheaders=[0, 1, 3, -3, -2, -1])
with open(txt_fp, 'w') as f:
f.write(text)
logger.info(f'Saved: {txt_fp}')

clean_header_kwargs = clean_header_kwargs or {}
text = self.clean_headers(**clean_header_kwargs)

if txt_fp is not None:
with open(txt_fp, 'w') as f:
f.write(text)
logger.info(f'Saved: {txt_fp}')

return text
21 changes: 21 additions & 0 deletions elm/web/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
html_to_text,
remove_blank_pages,
format_html_tables,
read_pdf,
replace_common_pdf_conversion_chars,
replace_multi_dot_lines,
remove_empty_lines_or_page_footers,
Expand Down Expand Up @@ -173,6 +174,26 @@ def _raw_pages(self):
raw_pages += [page for page in self.pages[self._last_page_index:]]
return raw_pages

@classmethod
def from_file(cls, fp, **init_kwargs):
    """Build a PDFDocument from a .pdf file stored on disk.

    Parameters
    ----------
    fp : str
        Filepath to the .pdf file to load.
    **init_kwargs
        Optional keyword arguments forwarded to the class initializer.

    Returns
    -------
    out : PDFDocument
        Document initialized from the pages read out of ``fp``.
    """
    # read_pdf is handed the raw file bytes, so open in binary mode
    with open(fp, 'rb') as pdf_file:
        pdf_bytes = pdf_file.read()
    return cls(read_pdf(pdf_bytes), **init_kwargs)


class HTMLDocument(BaseDocument):
"""ELM web HTML document"""
Expand Down
11 changes: 5 additions & 6 deletions examples/ordinance_gpt/config.json
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
{
"out_dir": ".",
"county_fp": "counties.csv",
"county_fp": "counties_single.csv",
"model": "gpt-4",
"azure_api_key": "<ADD AZURE OPENAI API KEY HERE>",
"azure_version": "<ADD AZURE OPENAI VERSION HERE>",
"azure_endpoint": "<ADD AZURE OPENAI ENDPOINT HERE>",
"azure_api_key": "<REDACTED>",
"azure_version": "2023-03-15-preview",
"azure_endpoint": "https://stratus-embeddings-south-central.openai.azure.com/",
"llm_call_kwargs":{
"temperature": 0,
"seed": 42,
Expand All @@ -20,6 +20,5 @@
"ppe_kwargs": {
"max_workers": 4
},
"pytesseract_exe_fp": "<Add tesseract.exe PATH HERE OR REMOVE THIS KEY>",
"log_level": "INFO"
}
}
83 changes: 83 additions & 0 deletions examples/ordinance_gpt/parse_pdf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
import threading
import asyncio
from elm.base import ApiBase
from elm.pdf import PDFtoTXT
from elm.chunk import Chunker
from elm.web.document import PDFDocument
from functools import partial
import openai
from langchain.text_splitter import RecursiveCharacterTextSplitter
from elm.ords.services.queues import initialize_service_queue
from elm.ords.services.openai import OpenAIService
from elm.ords.utilities import RTS_SEPARATORS
from elm.ords.process import validate_api_params
from elm.utilities.parse import read_pdf
from elm.ords.services.provider import RunningAsyncServices
from rex import init_logger

from elm.ords.extraction.apply import check_for_ordinance_info


async def run(services, doc, text_splitter, **kwargs):
    """Check a document for ordinance info while the services are running.

    Parameters
    ----------
    services : list
        Services handed to ``RunningAsyncServices`` for the duration of
        the check.
    doc : object
        Document passed to ``check_for_ordinance_info``.
    text_splitter : object
        Text splitter passed to ``check_for_ordinance_info``.
    **kwargs
        Extra keyword arguments forwarded to ``check_for_ordinance_info``.

    Returns
    -------
    doc : object
        Whatever ``check_for_ordinance_info`` returns for the input doc.
    """
    service_runner = RunningAsyncServices(services)
    async with service_runner:
        doc = await check_for_ordinance_info(
            doc, text_splitter, **kwargs
        )
    return doc


if __name__ == '__main__':
    # DEBUG-level loggers for this module and for the 'elm' package
    init_logger(__name__, log_level='DEBUG')
    init_logger('elm', log_level='DEBUG')

    # Example input document; change this to the PDF you want to parse
    fp = './county_ord_files/Box Elder County, Utah.pdf'

    # length_function counts tokens via ApiBase.count_tokens for 'gpt-4'
    text_splitter = RecursiveCharacterTextSplitter(
        RTS_SEPARATORS,
        chunk_size=3000,
        chunk_overlap=300,
        length_function=partial(ApiBase.count_tokens, model='gpt-4'),
    )

    doc = PDFDocument.from_file(fp)

    # Called without arguments so the Azure settings fall back to
    # environment variables (e.g. AZURE_OPENAI_API_KEY,
    # AZURE_OPENAI_VERSION) instead of being hard-coded in this script.
    azure_api_key, azure_version, azure_endpoint = validate_api_params()
    client = openai.AsyncAzureOpenAI(api_key=azure_api_key,
                                     api_version=azure_version,
                                     azure_endpoint=azure_endpoint)
    llm_service = OpenAIService(client, rate_limit=1e9)
    # The service queue is registered under the service's class name
    initialize_service_queue(llm_service.__class__.__name__)
    services = [llm_service]

    kwargs = dict(llm_service=llm_service, model='gpt-4', temperature=0,
                  max_tokens=1000)

    doc = asyncio.run(run(services, doc, text_splitter, **kwargs))

    # The original script stopped here with ``breakpoint()`` followed by a
    # bare ``raise``, which always fails with "RuntimeError: No active
    # exception to reraise". Report the result instead.
    # NOTE(review): assumes doc.metadata is a dict and that the check stores
    # its output under "ordinance_text", as the removed scratch code
    # suggested -- confirm against check_for_ordinance_info.
    print(doc.metadata.get("ordinance_text"))

0 comments on commit a3e3464

Please sign in to comment.