Skip to content

Commit

Permalink
Fix/correct split into chunks (#120)
Browse files Browse the repository at this point in the history
* fix split_transcript_into_chunks: account for the separator's token cost; also fold break-point detection into split_transcript_into_chunks

* improve logic in decide_where_to_break func

* improve readability

* improve split_transcript_into_chunks func
  • Loading branch information
smilni authored Nov 15, 2023
1 parent 2fe99ed commit 6220363
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 26 deletions.
40 changes: 18 additions & 22 deletions common/text_processing_for_prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,29 +8,25 @@ def check_token_number(text, model_name="gpt-3.5-turbo", enc=None):
return len_text


# the list of models with available encoders, see to define what model_name you need:
# https://github.com/openai/tiktoken/blob/39f29cecdb6fc38d9a3434e5dd15e4de58cf3c80/tiktoken/model.py#L19
def split_transcript_into_chunks(transcript, model_name="gpt-3.5-turbo", limit=3000, sep="\n"):
    """Split `transcript` into chunks of at most `limit` tokens each.

    The transcript is split on `sep` into parts, then consecutive parts are
    greedily re-joined with `sep` while the running token count — including
    the separator's own token cost — stays within `limit`.  A single part
    that alone exceeds `limit` is kept as its own oversized chunk rather
    than being cut in the middle.

    Args:
        transcript: text to split (e.g. a dialog transcript or concatenated
            per-document responses).
        model_name: tiktoken model name used to select the encoder.
        limit: maximum number of tokens allowed per chunk.
        sep: separator the parts are split on and re-joined with.

    Returns:
        list[str]: non-overlapping chunks, in order, such that
        `sep.join(chunks) == transcript`.
    """
    transcript_list = transcript.split(sep)
    enc = tiktoken.encoding_for_model(model_name)
    # the separator itself costs tokens every time two parts are re-joined,
    # so it must be counted toward the limit
    n_tokens_sep = check_token_number(sep, enc=enc)
    transcript_chunks = []
    # seed the first chunk explicitly: str.split always returns at least one
    # element, so indexing is safe.  (Testing `if not transcript_chunk:` per
    # iteration instead would silently drop empty leading parts — e.g. a
    # transcript starting with `sep` — and break the sep.join() round-trip.)
    transcript_chunk = transcript_list[0]
    len_chunk = check_token_number(transcript_chunk, enc=enc)
    for curr_part in transcript_list[1:]:
        n_tokens_curr_part = check_token_number(curr_part, enc=enc)
        if len_chunk + n_tokens_sep + n_tokens_curr_part <= limit:
            # current part still fits into the chunk being built
            len_chunk += n_tokens_sep + n_tokens_curr_part
            transcript_chunk += f"{sep}{curr_part}"
        else:
            # chunk is full: flush it and start a new one with the current part
            transcript_chunks.append(transcript_chunk)
            transcript_chunk = curr_part
            len_chunk = n_tokens_curr_part
    # flush the last (possibly only) chunk
    transcript_chunks.append(transcript_chunk)
    return transcript_chunks
6 changes: 2 additions & 4 deletions skills/dff_meeting_analysis_skill/scenario/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
from typing import List, Tuple
from common.text_processing_for_prompts import (
check_token_number,
decide_where_to_break,
split_transcript_into_chunks,
)
from common.containers import get_max_tokens_for_llm
Expand Down Expand Up @@ -222,10 +221,9 @@ def get_and_upload_response_for_one_doc(
# if we have multiple docs, we would like not to split one doc into two
# so in this case we split by special separator
if prompt_type == "weekly_report" or prompt_type == "combine_responses":
break_points = decide_where_to_break(orig_text, limit=token_limit, sep=SEP_FOR_DOC_RESPONSES)
transcript_chunks = split_transcript_into_chunks(orig_text, limit=token_limit, sep=SEP_FOR_DOC_RESPONSES)
else:
break_points = decide_where_to_break(orig_text, limit=token_limit)
transcript_chunks = split_transcript_into_chunks(orig_text, break_points)
transcript_chunks = split_transcript_into_chunks(orig_text, limit=token_limit)

# if asked for full report, we get parts of it separately and then just concatenate them
if prompt_type == "full_report":
Expand Down

0 comments on commit 6220363

Please sign in to comment.