From a3e346458968ad5ff903ee58475c4d22c24132ab Mon Sep 17 00:00:00 2001 From: grantbuster Date: Wed, 3 Jul 2024 15:45:55 -0600 Subject: [PATCH] Added example run script parse_pdf.py and started debugging --- elm/ords/extraction/ordinance.py | 6 +++ elm/ords/process.py | 13 ++--- elm/ords/validation/content.py | 2 +- elm/pdf.py | 26 +++++---- elm/web/document.py | 21 ++++++++ examples/ordinance_gpt/config.json | 11 ++-- examples/ordinance_gpt/parse_pdf.py | 83 +++++++++++++++++++++++++++++ 7 files changed, 139 insertions(+), 23 deletions(-) create mode 100644 examples/ordinance_gpt/parse_pdf.py diff --git a/elm/ords/extraction/ordinance.py b/elm/ords/extraction/ordinance.py index a77e832..a2d4402 100644 --- a/elm/ords/extraction/ordinance.py +++ b/elm/ords/extraction/ordinance.py @@ -172,6 +172,8 @@ async def parse(self, min_chunks_to_process=3): if not is_legal_text: logger.debug("Text at ind %d is not legal text", ind) continue + else: + logger.debug("Text at ind %d is legal text", ind) contains_ord_info = await self.parse_from_ind( ind, self.CONTAINS_ORD_PROMPT, key="contains_ord_info" @@ -181,6 +183,8 @@ async def parse(self, min_chunks_to_process=3): "Text at ind %d does not contain ordinance info", ind ) continue + else: + logger.debug("Text at ind %d does contain ordinance info", ind) is_utility_scale = await self.parse_from_ind( ind, self.IS_UTILITY_SCALE_PROMPT, key="x" @@ -190,6 +194,8 @@ async def parse(self, min_chunks_to_process=3): "Text at ind %d is not for utility-scale WECS", ind ) continue + else: + logger.debug("Text at ind %d is for utility-scale WECS", ind) self._ordinance_chunks.append({"text": text, "ind": ind}) logger.debug("Added text at ind %d to ordinances", ind) diff --git a/elm/ords/process.py b/elm/ords/process.py index acdc368..e375098 100644 --- a/elm/ords/process.py +++ b/elm/ords/process.py @@ -269,7 +269,7 @@ async def _process_with_logs( ): """Process counties with logging enabled.""" counties = _load_counties_to_process(county_fp) 
- azure_api_key, azure_version, azure_endpoint = _validate_api_params( + azure_api_key, azure_version, azure_endpoint = validate_api_params( azure_api_key, azure_version, azure_endpoint ) @@ -318,7 +318,7 @@ async def _process_with_logs( ) trackers.append(usage_tracker) task = asyncio.create_task( - download_docs_for_county_with_logging( + process_county_with_logging( log_listener, log_dir, location, @@ -379,7 +379,8 @@ def _load_counties_to_process(county_fp): return load_counties_from_fp(county_fp) -def _validate_api_params(azure_api_key, azure_version, azure_endpoint): +def validate_api_params(azure_api_key=None, azure_version=None, + azure_endpoint=None): """Validate OpenAI API parameters.""" azure_api_key = azure_api_key or os.environ.get("AZURE_OPENAI_API_KEY") azure_version = azure_version or os.environ.get("AZURE_OPENAI_VERSION") @@ -404,7 +405,7 @@ def _configure_file_loader_kwargs(file_loader_kwargs): return file_loader_kwargs -async def download_docs_for_county_with_logging( +async def process_county_with_logging( listener, log_dir, county, @@ -461,7 +462,7 @@ async def download_docs_for_county_with_logging( listener, log_dir, location=county.full_name, level=level ): task = asyncio.create_task( - download_doc_for_county( + process_county( county, text_splitter, num_urls=num_urls, @@ -485,7 +486,7 @@ async def download_docs_for_county_with_logging( return doc -async def download_doc_for_county( +async def process_county( county, text_splitter, num_urls=5, diff --git a/elm/ords/validation/content.py b/elm/ords/validation/content.py index 4355579..70cadea 100644 --- a/elm/ords/validation/content.py +++ b/elm/ords/validation/content.py @@ -125,13 +125,13 @@ async def parse_from_ind(self, ind, prompt, key): logger.debug("Mem at ind %d is %s", step, mem) check = mem.get(key) if check is None: - # logger.debug("text=%s", text) content = await self.slc.call( sys_msg=prompt.format(key=key), content=text, usage_sub_label="document_content_validation", ) check = 
mem[key] = content.get(key, False) + logger.info(f'Successfully called GPT! Check: {check}, ind: {ind}, key: {key}') if check: return check return False diff --git a/elm/pdf.py b/elm/pdf.py index 0f05630..b53d678 100644 --- a/elm/pdf.py +++ b/elm/pdf.py @@ -336,15 +336,19 @@ def clean_headers(self, char_thresh=0.6, page_thresh=0.8, split_on='\n', self.full = combine_pages(self.pages) return self.full - def convert_to_txt(self, txt_fp, separator=' '): - """Function to convert contents of pdf document to txt file. + def convert_to_txt(self, txt_fp=None, separator=' ', + clean_header_kwargs=None): + """Function to convert contents of pdf document to txt file using + poppler. Parameters ---------- - txt_fp: str - Directory for output txt file. + txt_fp: str | optional + Optional Directory for output txt file. separator : str Heuristic split string to look for spaces between columns + clean_headers : dict | None + Optional kwargs to override clean_headers kwargs Returns ------- @@ -354,11 +358,13 @@ def convert_to_txt(self, txt_fp, separator=' '): text = self.clean_poppler(layout=True) if is_multi_col(text, separator=separator): text = self.clean_poppler(layout=False) - text = self.clean_headers(char_thresh=0.6, page_thresh=0.8, - split_on='\n', - iheaders=[0, 1, 3, -3, -2, -1]) - with open(txt_fp, 'w') as f: - f.write(text) - logger.info(f'Saved: {txt_fp}') + + clean_header_kwargs = clean_header_kwargs or {} + text = self.clean_headers(**clean_header_kwargs) + + if txt_fp is not None: + with open(txt_fp, 'w') as f: + f.write(text) + logger.info(f'Saved: {txt_fp}') return text diff --git a/elm/web/document.py b/elm/web/document.py index 67ba17c..7d7b744 100644 --- a/elm/web/document.py +++ b/elm/web/document.py @@ -10,6 +10,7 @@ html_to_text, remove_blank_pages, format_html_tables, + read_pdf, replace_common_pdf_conversion_chars, replace_multi_dot_lines, remove_empty_lines_or_page_footers, @@ -173,6 +174,26 @@ def _raw_pages(self): raw_pages += [page for page in 
self.pages[self._last_page_index:]] raw_pages return raw_pages + @classmethod + def from_file(cls, fp, **init_kwargs): + """Initialize a PDFDocument object from a .pdf file on disk. + + Parameters + ---------- + fp : str + filepath to .pdf on disk + init_kwargs : dict + Optional kwargs for PDFDocument Initialization + + Returns + ------- + out : PDFDocument + Initialized PDFDocument class from input fp + """ + with open(fp, 'rb') as f: + pages = read_pdf(f.read()) + return cls(pages, **init_kwargs) + class HTMLDocument(BaseDocument): """ELM web HTML document""" diff --git a/examples/ordinance_gpt/config.json b/examples/ordinance_gpt/config.json index 37767ec..7a62d7c 100644 --- a/examples/ordinance_gpt/config.json +++ b/examples/ordinance_gpt/config.json @@ -1,10 +1,10 @@ { "out_dir": ".", - "county_fp": "counties.csv", + "county_fp": "counties_single.csv", "model": "gpt-4", - "azure_api_key": "", - "azure_version": "", - "azure_endpoint": "", + "azure_api_key": "", + "azure_version": "2023-03-15-preview", + "azure_endpoint": "", "llm_call_kwargs":{ "temperature": 0, "seed": 42, @@ -20,6 +20,5 @@ "ppe_kwargs": { "max_workers": 4 }, - "pytesseract_exe_fp": "", "log_level": "INFO" -} \ No newline at end of file +} diff --git a/examples/ordinance_gpt/parse_pdf.py b/examples/ordinance_gpt/parse_pdf.py new file mode 100644 index 0000000..e2d57ab --- /dev/null +++ b/examples/ordinance_gpt/parse_pdf.py @@ -0,0 +1,83 @@ +import threading +import asyncio +from elm.base import ApiBase +from elm.pdf import PDFtoTXT +from elm.chunk import Chunker +from elm.web.document import PDFDocument +from functools import partial +import openai +from langchain.text_splitter import RecursiveCharacterTextSplitter +from elm.ords.services.queues import initialize_service_queue +from elm.ords.services.openai import OpenAIService +from elm.ords.utilities import RTS_SEPARATORS +from elm.ords.process import 
validate_api_params +from elm.utilities.parse import read_pdf +from elm.ords.services.provider import RunningAsyncServices +from rex import init_logger + +from elm.ords.extraction.apply import check_for_ordinance_info + + +async def run(services, doc, text_splitter, **kwargs): + async with RunningAsyncServices(services): + doc = await check_for_ordinance_info(doc, text_splitter, **kwargs) + return doc + + +if __name__ == '__main__': + init_logger(__name__, log_level='DEBUG') + init_logger('elm', log_level='DEBUG') + + fp = './county_ord_files/Box Elder County, Utah.pdf' + + text_splitter = RecursiveCharacterTextSplitter( + RTS_SEPARATORS, + chunk_size=3000, + chunk_overlap=300, + length_function=partial(ApiBase.count_tokens, model='gpt-4'), + ) + + doc = PDFDocument.from_file(fp) + + azure_api_key, azure_version, azure_endpoint = validate_api_params() + client = openai.AsyncAzureOpenAI(api_key=azure_api_key, + api_version=azure_version, + azure_endpoint=azure_endpoint) + llm_service = OpenAIService(client, rate_limit=1e9) + initialize_service_queue(llm_service.__class__.__name__) + services = [llm_service] + + kwargs = dict(llm_service=llm_service, model='gpt-4', temperature=0, + max_tokens=1000) + + doc = asyncio.run(run(services, doc, text_splitter, **kwargs)) + + breakpoint() + raise + #doc = asyncio.run(check_for_ordinance_info(doc, text_splitter, **kwargs)) + + #print(doc.metadata["ordinance_text"]) + #breakpoint() + #raise + +# kwargs = dict(model="gpt-4", +# usage_tracker=None, +# usage_sub_label='document_content_validation', +# messages=[ +# {"role": "system", "content": "You are a helpful assistant."}, +# {"role": "user", "content": "Hello!"} +# ], +# temperature=0, +# max_tokens=1000) + #asyncio.run(llm_service( + + +# kwargs = dict(model="gpt-4", +# messages=[ +# {"role": "system", "content": "You are a helpful assistant."}, +# {"role": "user", "content": "Hello!"} +# ], +# temperature=0, +# max_tokens=1000) +# out = 
asyncio.run(llm_service._call_gpt(**kwargs)) +# print(out)