Skip to content

Commit

Permalink
fix: combined id of docs that are used together added to atts; fixed … (
Browse files Browse the repository at this point in the history
#123)

* fix: combined id of docs that are used together added to atts; fixed earlier generation retrieval for multiple docs

* add check that there are multiple docs to get combination id; remove extra logs

* concatenated ids of files -> list of ids of files

* improve set_correct_type_and_id func for cases where no id is needed
  • Loading branch information
smilni authored Nov 16, 2023
1 parent 8705d0b commit 88b3c9a
Show file tree
Hide file tree
Showing 5 changed files with 165 additions and 93 deletions.
16 changes: 15 additions & 1 deletion annotators/doc_processor/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,19 @@ Here is an example of what Document Processor may add to the dialog state:
{
"human": {
"attributes": {
"documents_in_use" = ["nlkr09lnvJ_7ed546db9846ba7661ceda123837f7fc", "kKmcdwiow9_7ed546db9846ba7661ceda123837f7fc"]
"documents_in_use" = ["nlkr09lnvJ_7ed546db9846ba7661ceda123837f7fc", "kKmcdwiow9_7ed546db9846ba7661ceda123837f7fc"],
"documents_combination_ids" = {
"LKNpck0nke_7ed546db9846ba7661ceda123837f7f":
[
"nlkr09lnvJ_7ed546db9846ba7661ceda123837f7fc",
"kKmcdwiow9_7ed546db9846ba7661ceda123837f7fc"
],
"kfmIOJkm9e_7ed546db9846ba7661ceda123837f7f":
[
"nlkr09lnvJ_7ed546db9846ba7661ceda123837f7fc",
"lrfmovor99_jdcn096db9846ba681ceda398kewn93"
]
},
"processed_documents" = {
"nlkr09lnvJ_7ed546db9846ba7661ceda123837f7fc":
{
Expand Down Expand Up @@ -39,6 +51,8 @@ Here is an example of what Document Processor may add to the dialog state:

`documents_in_use` are the documents that are being discussed on this step of the dialog. These are typically the documents specified in the attributes of the last human utterance or the arguments of doc-processor docker container.

`documents_combination_ids` is a dictionary mapping each combination id to the ids of the documents that the combination includes. Any set of two or more docs that were ever used together in `documents_in_use` has its own combination id stored in `documents_combination_ids`.

`processed_documents` are all documents that were given by the user during the dialog and processed by the system, with all the information available about these documents. `processed_documents` always includes `documents_in_use` and may include previously discussed documents if there are any.

## Parameters
Expand Down
18 changes: 13 additions & 5 deletions annotators/doc_processor/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@ def process_and_upload_doc():
bot_utts = dialog.get("bot_utterances", [])
docs_in_atts = human_utts[-1].get("attributes", {}).get("documents", [])
all_docs_info = dialog.get("human", {}).get("attributes", {}).get("processed_documents", {})
docs_combination_ids_info = (
dialog.get("human", {}).get("attributes", {}).get("documents_combination_ids", {})
)
docs_in_use_info = dialog.get("human", {}).get("attributes", {}).get("documents_in_use", [])
# even if we reset the dialog, we may still get some old files in docs_in_use
# thus, for a new dialog, we manually reset docs_in_use
Expand All @@ -46,17 +49,21 @@ def process_and_upload_doc():
# check if we got sth from attributes (docs_in_atts) or arguments (DOC_PATHS_OR_LINKS)
# if these docs were not processed yet, process them and upload to file server
# if these docs were already processed, just reset n_steps_discussed
new_docs_in_use_info, new_docs_info = upload_documents_save_info(
new_docs_in_use_info, new_docs_info, docs_combination_ids_new = upload_documents_save_info(
docs_in_atts, DOC_PATHS_OR_LINKS, all_docs_info, docs_in_use_info, dialog_id
)
# update dicts to be used in human_attributes with new info
# for docs_in_use, if we got new docs, forget the old ones
# update dicts to be used in human_attributes with new info for docs_in_use
# and new combination id for these new docs_in_use
# if we got new docs, remove the old ones from docs_in_use_info
if new_docs_in_use_info:
docs_in_use_info = new_docs_in_use_info
all_docs_info.update(new_docs_info)
docs_combination_ids_info.update(docs_combination_ids_new)
# only update attributes if we received some documents
if new_docs_info:
all_docs_info.update(new_docs_info)
logger.info("Received and processed new document(s).")
# if no new documents received, we can either leave the attributes as they are
# or in some cases clear active documents if we don't want to continue discussing them
else:
# check if document is being discussed for too long; if yes, clear docs_in_use_info
# do not check that if we have any document in attributes
Expand All @@ -73,12 +80,13 @@ def process_and_upload_doc():
docs_in_use_info.clear()
logger.info(
f"No skills using docs active for {N_TURNS_TO_KEEP_DOC} turns. \
Remove all docs from active memory."
Remove all docs from active memory."
)
human_atts = {
"human_attributes": {
"processed_documents": all_docs_info,
"documents_in_use": docs_in_use_info,
"documents_combination_ids": docs_combination_ids_info,
}
}
attributes_to_add.append(human_atts)
Expand Down
15 changes: 12 additions & 3 deletions annotators/doc_processor/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,7 +244,7 @@ def get_docs_to_process(all_docs_to_check: List[str], all_docs_info: dict, docs_

def upload_documents_save_info(
docs_in_atts: List[str], doc_paths_or_links: List[str], all_docs_info: dict, docs_in_use_info: dict, dialog_id: str
) -> Tuple[dict, dict]:
) -> Tuple[list, dict, dict]:
"""Processes the given documents to get plain text if they were not processed before,
uploads them to file server and returns information about each.
NB: If there are multiple documents, their text is concatenated and uploaded to server as one .txt file.
Expand All @@ -260,6 +260,12 @@ def upload_documents_save_info(
documents_in_use = ['nlkr09lnvJ_7ed546db9846ba7661ceda123837f7fc',
'kKmcdwiow9_7ed546db9846ba7661ceda123837f7fc']
A dictionary mapping each combination id to the list of ids of files currently in use:
docs_combination_ids = {
    'LKNpck0nke_7ed546db9846ba7661ceda123837f7fc':
        ['nlkr09lnvJ_7ed546db9846ba7661ceda123837f7fc',
         'kKmcdwiow9_7ed546db9846ba7661ceda123837f7fc']
}
Another one mapping ids of all files that were ever used to information about them, such as
file source and link to the file with processed text:
processed_documents = {
Expand All @@ -284,7 +290,7 @@ def upload_documents_save_info(
# (either fully unprocessed or processed sometime earlier but not yet present in current docs_in_use)
all_docs_to_check = list(set(docs_in_atts + doc_paths_or_links))
docs_and_types = get_docs_to_process(all_docs_to_check, all_docs_info, docs_in_use_info)
all_docs_info_new = {}
all_docs_info_new, docs_combination_ids_new = {}, {}
docs_in_use_info_new = []
# check if we need to process anything
if docs_and_types:
Expand Down Expand Up @@ -327,4 +333,7 @@ def upload_documents_save_info(
all_docs_info_new[file_id]["processed_text_link"] = doc_text_link
all_docs_info_new[file_id]["filename"] = filename
docs_in_use_info_new.append(file_id)
return docs_in_use_info_new, all_docs_info_new
if len(docs_in_use_info_new) > 1:
doc_combination_id = generate_unique_file_id(10, dialog_id)
docs_combination_ids_new[doc_combination_id] = docs_in_use_info_new
return docs_in_use_info_new, all_docs_info_new, docs_combination_ids_new
163 changes: 93 additions & 70 deletions skills/dff_meeting_analysis_skill/scenario/response.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
compose_and_upload_final_response,
get_older_gen_response,
get_name_and_text_from_file,
get_key_by_value,
)


Expand Down Expand Up @@ -89,86 +90,108 @@ def gathering_responses(reply, confidence, human_attr, bot_attr, attr):
related_files = bot_utts[-1].get("user", {}).get("attributes", {}).get("related_files", {})
# check if we already received such request before and saved hyp for it to server
documents_in_use = context[-1].get("user", {}).get("attributes", {}).get("documents_in_use", [])
docs_combination_ids = (
context[-1].get("user", {}).get("attributes", {}).get("documents_combination_ids", {})
)
all_docs_info = context[-1].get("user", {}).get("attributes", {}).get("processed_documents", {})
sending_variables = compose_sending_variables({}, ENVVARS_TO_SEND, human_uttr_attributes)
hyps_and_names_all_docs = []
if documents_in_use:
_all_docs_have_summary = True
for document_in_use_id in documents_in_use:
if related_files.get(f"summary__{document_in_use_id}", None) is None:
_all_docs_have_summary = False
# if we need a weekly report, on this step we gather separate daily reports for each doc
# also here we change the type of summary prompt based on summary length request
_need_to_get_response_from_llm = True
# check if have final hypothesis for this request in case of multiple docs in use
if len(documents_in_use) > 1:
prompt_type_local, _ = set_correct_type_and_id(request, prompt_type_local)
curr_combination_id = get_key_by_value(docs_combination_ids, documents_in_use)
prompt_type_and_combination_id = f"{prompt_type_local}__{curr_combination_id}"
if prompt_type_and_combination_id in related_files.keys():
_need_to_get_response_from_llm = False
hypotheses = [get_older_gen_response(prompt_type_and_combination_id, related_files)]
# check if have final hypothesis for this request in case of one doc in use
else:
prompt_type_local, prompt_type_and_id = set_correct_type_and_id(
request, prompt_type_local, document_in_use_id
request, prompt_type_local, document_in_use_id=documents_in_use[0]
)
# we do not do anything unless we have the link to our file(s) in use
transcript_link = all_docs_info[document_in_use_id].get("processed_text_link", "")
if transcript_link:
# here we check if we already generated sth for the same request and the same doc
if prompt_type_and_id in related_files.keys():
# in the future, it is better to store filenames in related_files
# to avoid extra requests to file server
filename, _ = get_name_and_text_from_file(transcript_link)
older_response = get_older_gen_response(prompt_type_and_id, related_files)
hyp_and_name_one_doc = [(filename, older_response)]
# if no, let's generate it
else:
logger.info(
f"No earlier {prompt_type_and_id} found. \
if prompt_type_and_id in related_files.keys():
_need_to_get_response_from_llm = False
hypotheses = [get_older_gen_response(prompt_type_and_id, related_files)]

if _need_to_get_response_from_llm:
for document_in_use_id in documents_in_use:
if related_files.get(f"summary__{document_in_use_id}", None) is None:
_all_docs_have_summary = False
# if we need a weekly report, on this step we gather separate daily reports for each doc
# also here we change the type of summary prompt based on summary length request
prompt_type_local, prompt_type_and_id = set_correct_type_and_id(
request, prompt_type_local, document_in_use_id=document_in_use_id
)
# we do not do anything unless we have the link to our file(s) in use
transcript_link = all_docs_info[document_in_use_id].get("processed_text_link", "")
if transcript_link:
# here we check if we already generated sth for the same request and the same doc
if prompt_type_and_id in related_files.keys():
# in the future, it is better to store filenames in related_files
# to avoid extra requests to file server
filename, _ = get_name_and_text_from_file(transcript_link)
older_response = get_older_gen_response(prompt_type_and_id, related_files)
hyp_and_name_one_doc = [(filename, older_response)]
# if no, let's generate it
else:
logger.info(
f"No earlier {prompt_type_and_id} found. \
Sending request to generative model."
)
try:
filename, orig_text = get_name_and_text_from_file(transcript_link)
hyp_one_doc, related_files, _n_requests = get_and_upload_response_for_one_doc(
orig_text,
prompt_type_and_id,
dialog_context,
sending_variables,
related_files,
)
hyp_and_name_one_doc = [(filename, hyp_one_doc)]
n_requests += _n_requests
except Exception as e:
sentry_sdk.capture_exception(e)
logger.exception(e)
hyp_and_name_one_doc = []
try:
filename, orig_text = get_name_and_text_from_file(transcript_link)
hyp_one_doc, related_files, _n_requests = get_and_upload_response_for_one_doc(
orig_text,
prompt_type_and_id,
dialog_context,
sending_variables,
related_files,
)
hyp_and_name_one_doc = [(filename, hyp_one_doc)]
n_requests += _n_requests
except Exception as e:
sentry_sdk.capture_exception(e)
logger.exception(e)
hyp_and_name_one_doc = []
else:
hyp_and_name_one_doc = []
hyps_and_names_all_docs += hyp_and_name_one_doc

if prompt_type == "question_answering" and _all_docs_have_summary:
# if we are in `question_answering` node then
# the condition `go_to_question_answering` was requested once
n_requests += 1
# having got responses for all docs, let's make one response from it
# just return the response if we have one document and one response
if len(hyps_and_names_all_docs) == 1 and prompt_type_local != "weekly_report":
hypotheses = [hyps_and_names_all_docs[0][1]]
else:
hyp_and_name_one_doc = []
hyps_and_names_all_docs += hyp_and_name_one_doc

if prompt_type == "question_answering" and _all_docs_have_summary:
# if we are in `question_answering` node then
# the condition `go_to_question_answering` was requested once
n_requests += 1
# having got responses for all docs, let's make one response from it
# just return the response if we have one document and one response
if len(hyps_and_names_all_docs) == 1 and prompt_type_local != "weekly_report":
hypotheses = [hyps_and_names_all_docs[0][1]]
else:
# earlier we set prompt_type_and_id for weekly_analysis to full report for each doc,
# now we need it to set it back
prompt_type_and_id = f"{prompt_type_local}__{document_in_use_id}"
try:
# now by default we are passing filenames to LLM together with hypothesis for each file
# you can choose to pass only hypotheses (no filenames) by setting use_filenames=False
# when calling compose_and_upload_final_response()
hypotheses, related_files, _n_requests = compose_and_upload_final_response(
hyps_and_names_all_docs,
prompt_type_and_id,
dialog_context,
sending_variables,
related_files,
)
n_requests += _n_requests
except Exception as e:
sentry_sdk.capture_exception(e)
logger.exception(e)
hypotheses = []

# for full report and weekly report, add formatting
if prompt_type_local == "weekly_report" or prompt_type_local == "full_report":
hypotheses = postprocess_formatting(hypotheses, prompt_type=prompt_type_local)
# earlier we set prompt_type_and_id for weekly_analysis to full report for each doc,
# now we need it to set it back
prompt_type_and_id = f"{prompt_type_local}__{curr_combination_id}"
try:
# now by default we are passing filenames to LLM together with hypothesis for each file
# you can choose to pass only hypotheses (no filenames) by setting use_filenames=False
# when calling compose_and_upload_final_response()
hypotheses, related_files, _n_requests = compose_and_upload_final_response(
hyps_and_names_all_docs,
prompt_type_and_id,
dialog_context,
sending_variables,
related_files,
)
n_requests += _n_requests
except Exception as e:
sentry_sdk.capture_exception(e)
logger.exception(e)
hypotheses = []
# for full report and weekly report, add formatting
if prompt_type_local == "weekly_report" or prompt_type_local == "full_report":
hypotheses = postprocess_formatting(hypotheses, prompt_type=prompt_type_local)

# if there are docs in human utt attributes, but no processed docs in use were found
elif docs_in_attributes:
hyp_excuse = """Sorry, I failed to process the file you provided. \
Expand Down
Loading

0 comments on commit 88b3c9a

Please sign in to comment.