Added example run script parse_pdf.py and started debugging

NREL · Jul 3, 2024 · a3e3464 · a3e3464
1 parent 906eeba
commit a3e3464
Show file tree

Hide file tree

Showing 7 changed files with 139 additions and 23 deletions.
diff --git a/elm/ords/extraction/ordinance.py b/elm/ords/extraction/ordinance.py
@@ -172,6 +172,8 @@ async def parse(self, min_chunks_to_process=3):
                 if not is_legal_text:
                     logger.debug("Text at ind %d is not legal text", ind)
                     continue
+                else:
+                    logger.debug("Text at ind %d is legal text", ind)
 
             contains_ord_info = await self.parse_from_ind(
                 ind, self.CONTAINS_ORD_PROMPT, key="contains_ord_info"
@@ -181,6 +183,8 @@ async def parse(self, min_chunks_to_process=3):
                     "Text at ind %d does not contain ordinance info", ind
                 )
                 continue
+            else:
+                logger.debug("Text at ind %d does contain ordinance info", ind)
 
             is_utility_scale = await self.parse_from_ind(
                 ind, self.IS_UTILITY_SCALE_PROMPT, key="x"
@@ -190,6 +194,8 @@ async def parse(self, min_chunks_to_process=3):
                     "Text at ind %d is not for utility-scale WECS", ind
                 )
                 continue
+            else:
+                logger.debug("Text at ind %d is for utility-scale WECS", ind)
 
             self._ordinance_chunks.append({"text": text, "ind": ind})
             logger.debug("Added text at ind %d to ordinances", ind)

diff --git a/elm/ords/process.py b/elm/ords/process.py
@@ -269,7 +269,7 @@ async def _process_with_logs(
 ):
     """Process counties with logging enabled."""
     counties = _load_counties_to_process(county_fp)
-    azure_api_key, azure_version, azure_endpoint = _validate_api_params(
+    azure_api_key, azure_version, azure_endpoint = validate_api_params(
         azure_api_key, azure_version, azure_endpoint
     )
 
@@ -318,7 +318,7 @@ async def _process_with_logs(
             )
             trackers.append(usage_tracker)
             task = asyncio.create_task(
-                download_docs_for_county_with_logging(
+                process_county_with_logging(
                     log_listener,
                     log_dir,
                     location,
@@ -379,7 +379,8 @@ def _load_counties_to_process(county_fp):
     return load_counties_from_fp(county_fp)
 
 
-def _validate_api_params(azure_api_key, azure_version, azure_endpoint):
+def validate_api_params(azure_api_key=None, azure_version=None,
+                        azure_endpoint=None):
     """Validate OpenAI API parameters."""
     azure_api_key = azure_api_key or os.environ.get("AZURE_OPENAI_API_KEY")
     azure_version = azure_version or os.environ.get("AZURE_OPENAI_VERSION")
@@ -404,7 +405,7 @@ def _configure_file_loader_kwargs(file_loader_kwargs):
     return file_loader_kwargs
 
 
-async def download_docs_for_county_with_logging(
+async def process_county_with_logging(
     listener,
     log_dir,
     county,
@@ -461,7 +462,7 @@ async def download_docs_for_county_with_logging(
         listener, log_dir, location=county.full_name, level=level
     ):
         task = asyncio.create_task(
-            download_doc_for_county(
+            process_county(
                 county,
                 text_splitter,
                 num_urls=num_urls,
@@ -485,7 +486,7 @@ async def download_docs_for_county_with_logging(
         return doc
 
 
-async def download_doc_for_county(
+async def process_county(
     county,
     text_splitter,
     num_urls=5,

diff --git a/elm/ords/validation/content.py b/elm/ords/validation/content.py
@@ -125,13 +125,13 @@ async def parse_from_ind(self, ind, prompt, key):
             logger.debug("Mem at ind %d is %s", step, mem)
             check = mem.get(key)
             if check is None:
-                # logger.debug("text=%s", text)
                 content = await self.slc.call(
                     sys_msg=prompt.format(key=key),
                     content=text,
                     usage_sub_label="document_content_validation",
                 )
                 check = mem[key] = content.get(key, False)
+                logger.info(f'Successfully called GPT! Check: {check}, ind: {ind}, key: {key}')
             if check:
                 return check
         return False

diff --git a/elm/pdf.py b/elm/pdf.py
@@ -336,15 +336,19 @@ def clean_headers(self, char_thresh=0.6, page_thresh=0.8, split_on='\n',
         self.full = combine_pages(self.pages)
         return self.full
 
-    def convert_to_txt(self, txt_fp, separator='    '):
-        """Function to convert contents of pdf document to txt file.
+    def convert_to_txt(self, txt_fp=None, separator='    ',
+                       clean_header_kwargs=None):
+        """Function to convert contents of pdf document to txt file using
+        poppler.
 
         Parameters
         ----------
-        txt_fp: str
-            Directory for output txt file.
+        txt_fp : str | None
+            Optional filepath for output txt file; if None, nothing is saved.
         separator : str
             Heuristic split string to look for spaces between columns
+        clean_header_kwargs : dict | None
+            Optional kwargs passed to clean_headers(); None uses its defaults.
 
         Returns
         -------
@@ -354,11 +358,13 @@ def convert_to_txt(self, txt_fp, separator='    '):
         text = self.clean_poppler(layout=True)
         if is_multi_col(text, separator=separator):
             text = self.clean_poppler(layout=False)
-        text = self.clean_headers(char_thresh=0.6, page_thresh=0.8,
-                                  split_on='\n',
-                                  iheaders=[0, 1, 3, -3, -2, -1])
-        with open(txt_fp, 'w') as f:
-            f.write(text)
-        logger.info(f'Saved: {txt_fp}')
+
+        clean_header_kwargs = clean_header_kwargs or {}
+        text = self.clean_headers(**clean_header_kwargs)
+
+        if txt_fp is not None:
+            with open(txt_fp, 'w') as f:
+                f.write(text)
+                logger.info(f'Saved: {txt_fp}')
 
         return text
diff --git a/elm/web/document.py b/elm/web/document.py
@@ -10,6 +10,7 @@
     html_to_text,
     remove_blank_pages,
     format_html_tables,
+    read_pdf,
     replace_common_pdf_conversion_chars,
     replace_multi_dot_lines,
     remove_empty_lines_or_page_footers,
@@ -173,6 +174,26 @@ def _raw_pages(self):
             raw_pages += [page for page in self.pages[self._last_page_index:]]
         return raw_pages
 
+    @classmethod
+    def from_file(cls, fp, **init_kwargs):
+        """Build a document from the contents of a PDF file on disk.
+
+        Parameters
+        ----------
+        fp : str
+            Path to the PDF file; it is opened in binary mode.
+        init_kwargs : dict
+            Extra keyword arguments forwarded to the class initializer.
+
+        Returns
+        -------
+        out : PDFDocument
+            Instance of ``cls`` built from what ``read_pdf`` returns.
+        """
+        with open(fp, 'rb') as pdf_file:
+            raw_bytes = pdf_file.read()
+        return cls(read_pdf(raw_bytes), **init_kwargs)
+
 
 class HTMLDocument(BaseDocument):
     """ELM web HTML document"""

diff --git a/examples/ordinance_gpt/config.json b/examples/ordinance_gpt/config.json
@@ -1,10 +1,10 @@
 {
     "out_dir": ".",
-    "county_fp": "counties.csv",
+    "county_fp": "counties_single.csv",
     "model": "gpt-4",
-    "azure_api_key": "<ADD AZURE OPENAI API KEY HERE>",
-    "azure_version": "<ADD AZURE OPENAI VERSION HERE>",
-    "azure_endpoint": "<ADD AZURE OPENAI ENDPOINT HERE>",
+    "azure_api_key": "<ADD AZURE OPENAI API KEY HERE>",
+    "azure_version": "2023-03-15-preview",
+    "azure_endpoint": "https://stratus-embeddings-south-central.openai.azure.com/",
     "llm_call_kwargs":{
         "temperature": 0,
         "seed": 42,
@@ -20,6 +20,5 @@
     "ppe_kwargs": {
         "max_workers": 4
     },
-    "pytesseract_exe_fp": "<Add tesseract.exe PATH HERE OR REMOVE THIS KEY>",
     "log_level": "INFO"
-}
+}
diff --git a/examples/ordinance_gpt/parse_pdf.py b/examples/ordinance_gpt/parse_pdf.py
@@ -0,0 +1,93 @@
+import threading
+import asyncio
+from elm.base import ApiBase
+from elm.pdf import PDFtoTXT
+from elm.chunk import Chunker
+from elm.web.document import PDFDocument
+from functools import partial
+import openai
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from elm.ords.services.queues import initialize_service_queue
+from elm.ords.services.openai import OpenAIService
+from elm.ords.utilities import RTS_SEPARATORS
+from elm.ords.process import validate_api_params
+from elm.utilities.parse import read_pdf
+from elm.ords.services.provider import RunningAsyncServices
+from rex import init_logger
+
+from elm.ords.extraction.apply import check_for_ordinance_info
+
+
+async def run(services, doc, text_splitter, **kwargs):
+    """Check a document for ordinance info while the services are running.
+
+    Parameters
+    ----------
+    services : list
+        Services handed to ``RunningAsyncServices`` for the duration of
+        the check.
+    doc : object
+        Document to check (a ``PDFDocument`` in this script).
+    text_splitter : object
+        Splitter forwarded to ``check_for_ordinance_info``.
+    **kwargs
+        Extra keyword arguments forwarded to ``check_for_ordinance_info``.
+
+    Returns
+    -------
+    doc
+        The value returned by ``check_for_ordinance_info``.
+    """
+    async with RunningAsyncServices(services):
+        doc = await check_for_ordinance_info(doc, text_splitter, **kwargs)
+    return doc
+
+
+def main(fp='./county_ord_files/Box Elder County, Utah.pdf', model='gpt-4'):
+    """Run the ordinance check on a single PDF file.
+
+    Parameters
+    ----------
+    fp : str
+        Path to the PDF file to check.
+    model : str
+        Model name used for token counting and for the LLM calls.
+
+    Returns
+    -------
+    doc
+        The value returned by ``run`` for the loaded document.
+    """
+    text_splitter = RecursiveCharacterTextSplitter(
+        RTS_SEPARATORS,
+        chunk_size=3000,
+        chunk_overlap=300,
+        length_function=partial(ApiBase.count_tokens, model=model),
+    )
+
+    doc = PDFDocument.from_file(fp)
+
+    # With no arguments the API parameters fall back to the
+    # AZURE_OPENAI_* environment variables (see validate_api_params).
+    azure_api_key, azure_version, azure_endpoint = validate_api_params()
+    client = openai.AsyncAzureOpenAI(api_key=azure_api_key,
+                                     api_version=azure_version,
+                                     azure_endpoint=azure_endpoint)
+    llm_service = OpenAIService(client, rate_limit=1e9)
+    initialize_service_queue(llm_service.__class__.__name__)
+    services = [llm_service]
+
+    kwargs = dict(llm_service=llm_service, model=model, temperature=0,
+                  max_tokens=1000)
+
+    return asyncio.run(run(services, doc, text_splitter, **kwargs))
+
+
+if __name__ == '__main__':
+    init_logger(__name__, log_level='DEBUG')
+    init_logger('elm', log_level='DEBUG')
+
+    doc = main()
+    # NOTE(review): assumes ``doc.metadata`` is a dict that holds an
+    # "ordinance_text" entry only when ordinance text was found - confirm.
+    print(doc.metadata.get("ordinance_text"))