Skip to content

Commit

Permalink
Fix/correct split into chunks (#120)
Browse files Browse the repository at this point in the history
* fix split_transcript_into_chunks: account for the separator's token cost; also fold break-point detection into split_transcript_into_chunks

* improve logic in decide_where_to_break func

* improve readability

* improve split_transcript_into_chunks func
  • Loading branch information
smilni authored Nov 15, 2023
1 parent 2fe99ed commit 6220363
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 26 deletions.
40 changes: 18 additions & 22 deletions common/text_processing_for_prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,29 +8,25 @@ def check_token_number(text, model_name="gpt-3.5-turbo", enc=None):
return len_text


# the list of models with available encoders, see to define what model_name you need:
# https://github.com/openai/tiktoken/blob/39f29cecdb6fc38d9a3434e5dd15e4de58cf3c80/tiktoken/model.py#L19
def split_transcript_into_chunks(transcript, model_name="gpt-3.5-turbo", limit=3000, sep="\n"):
    """Split `transcript` into chunks of at most `limit` tokens each.

    The transcript is split on `sep` into parts, then consecutive parts are
    greedily re-joined with `sep` while the running token count — including
    the separator's own token cost — stays within `limit`.  A single part
    that alone exceeds `limit` is kept as its own oversized chunk rather
    than being cut in the middle.

    Args:
        transcript: text to split (e.g. a dialog transcript or concatenated
            per-document responses).
        model_name: tiktoken model name used to select the encoder.
        limit: maximum number of tokens allowed per chunk.
        sep: separator the parts are split on and re-joined with.

    Returns:
        list[str]: non-overlapping chunks, in order, such that
        `sep.join(chunks) == transcript`.
    """
    transcript_list = transcript.split(sep)
    enc = tiktoken.encoding_for_model(model_name)
    # the separator itself costs tokens every time two parts are re-joined,
    # so it must be counted toward the limit
    n_tokens_sep = check_token_number(sep, enc=enc)
    transcript_chunks = []
    # seed the first chunk explicitly: str.split always returns at least one
    # element, so indexing is safe.  (Testing `if not transcript_chunk:` per
    # iteration instead would silently drop empty leading parts — e.g. a
    # transcript starting with `sep` — and break the sep.join() round-trip.)
    transcript_chunk = transcript_list[0]
    len_chunk = check_token_number(transcript_chunk, enc=enc)
    for curr_part in transcript_list[1:]:
        n_tokens_curr_part = check_token_number(curr_part, enc=enc)
        if len_chunk + n_tokens_sep + n_tokens_curr_part <= limit:
            # current part still fits into the chunk being built
            len_chunk += n_tokens_sep + n_tokens_curr_part
            transcript_chunk += f"{sep}{curr_part}"
        else:
            # chunk is full: flush it and start a new one with the current part
            transcript_chunks.append(transcript_chunk)
            transcript_chunk = curr_part
            len_chunk = n_tokens_curr_part
    # flush the last (possibly only) chunk
    transcript_chunks.append(transcript_chunk)
    return transcript_chunks
6 changes: 2 additions & 4 deletions skills/dff_meeting_analysis_skill/scenario/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
from typing import List, Tuple
from common.text_processing_for_prompts import (
check_token_number,
decide_where_to_break,
split_transcript_into_chunks,
)
from common.containers import get_max_tokens_for_llm
Expand Down Expand Up @@ -222,10 +221,9 @@ def get_and_upload_response_for_one_doc(
# if we have multiple docs, we would like not to split one doc into two
# so in this case we split by special separator
if prompt_type == "weekly_report" or prompt_type == "combine_responses":
break_points = decide_where_to_break(orig_text, limit=token_limit, sep=SEP_FOR_DOC_RESPONSES)
transcript_chunks = split_transcript_into_chunks(orig_text, limit=token_limit, sep=SEP_FOR_DOC_RESPONSES)
else:
break_points = decide_where_to_break(orig_text, limit=token_limit)
transcript_chunks = split_transcript_into_chunks(orig_text, break_points)
transcript_chunks = split_transcript_into_chunks(orig_text, limit=token_limit)

# if asked for full report, we get parts of it separately and then just concatenate them
if prompt_type == "full_report":
Expand Down

0 comments on commit 6220363

Please sign in to comment.