-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathphrase_splitter.py
75 lines (66 loc) · 2.61 KB
/
phrase_splitter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import openai
from promptengine.pipelines import PromptPipeline
from promptengine.template import PromptTemplate, PromptPermutationGenerator
from promptengine.utils import LLM, extract_responses
import spacy
import string
# Shared spaCy pipeline (the "en_core_web_sm" model), loaded once at import
# time. split_and_concatenate() parses text with it and reads each token's
# dependency head.
nlp = spacy.load("en_core_web_sm")
TEMPERATURE = 1 # Passed as `temperature=` to the ChatGPT calls in find_segments()
# Yes/No question sent to the LLM for each candidate prefix. The ${sentence}
# placeholder is supplied under the "sentence" key in
# SentenceSegmenterPromptPipeline.gen_prompts().
SENTENCE_SEGMENTER_PROMPT_TEMPLATE = \
"""Does the following sentence end properly?
"${sentence}"
Please answer only Yes or No."""
class SentenceSegmenterPromptPipeline(PromptPipeline):
    """Prompt pipeline built around SENTENCE_SEGMENTER_PROMPT_TEMPLATE.

    Each generated prompt asks whether one candidate sentence ends properly.
    """

    def __init__(self):
        # The template is stored before the base initializer runs; the base
        # class receives 'responses.json' (presumably the file it uses to
        # store responses -- confirm against PromptPipeline).
        self._template = PromptTemplate(SENTENCE_SEGMENTER_PROMPT_TEMPLATE)
        super().__init__('responses.json')

    def gen_prompts(self, properties):
        """Return the list of prompts generated for ``properties["sentence"]``."""
        permutations = PromptPermutationGenerator(self._template)
        fill_values = {"sentence": properties["sentence"]}
        return list(permutations(fill_values))
def split_and_concatenate(sentence):
    """Return the growing prefixes of *sentence* that end at candidate cut points.

    Trailing punctuation and surrounding whitespace are removed, the text is
    parsed with the module-level spaCy pipeline, and one prefix is produced
    for every token whose syntactic head is not to its right.

    Args:
        sentence: The text to split.

    Returns:
        A list of prefixes of the cleaned sentence, shortest first. Each
        prefix is an exact slice of the cleaned text, so original spacing,
        contractions and inner punctuation are preserved.
    """
    # Remove punctuation at the end of the sentence, then outer whitespace.
    text = sentence.rstrip(string.punctuation).strip()
    doc = nlp(text)
    prefixes = []
    for token in doc:
        # Current word has a parent on the right of it, so ending the
        # sentence here will not make sense.
        if token.head.i > token.i:
            continue
        # Slice the Doc instead of ' '.join()-ing token texts: joining would
        # insert spaces spaCy's tokenizer never saw ("don't" -> "do n't").
        # Span.text omits the last token's trailing whitespace.
        prefixes.append(doc[: token.i + 1].text)
    return prefixes
def strip_wrapping_quotes(s: str) -> str:
    """Remove at most one leading and one trailing double quote from *s*.

    The two ends are handled independently, so an unmatched quote on either
    side is removed as well.

    Args:
        s: The string to clean; may be empty.

    Returns:
        *s* without its wrapping double quote(s).
    """
    # startswith/endswith are safe on the empty string, unlike s[0] / s[-1],
    # which raise IndexError for "" (and for '"' once the first quote is cut).
    if s.startswith('"'):
        s = s[1:]
    if s.endswith('"'):
        s = s[:-1]
    return s
def extract_new_phrases(sentences):
    """Reduce a sequence of strings to what each one adds over its predecessor.

    Args:
        sentences: Iterable of strings, typically successive prefixes of one
            sentence.

    Returns:
        A list with one entry per input string: the part after the previous
        string when the previous string is a prefix of it, otherwise the
        whole string. The first entry is always the first string unchanged.
    """
    def _increments():
        prior = ""
        for current in sentences:
            # Keep only the newly added tail when `current` extends `prior`.
            yield current[len(prior):] if current.startswith(prior) else current
            prior = current

    return list(_increments())
def find_segments(sentence, k):
    """Split *sentence* into phrases at points the LLM accepts as proper endings.

    Every candidate prefix from split_and_concatenate() is sent to ChatGPT
    through SentenceSegmenterPromptPipeline; prefixes whose first reply
    contains "yes" (case-insensitive) are kept and then reduced to their
    incremental parts by extract_new_phrases().

    Args:
        sentence: The text to segment.
        k: OpenAI API key. It is assigned to ``openai.api_key``, a
            module-level setting.

    Returns:
        The list returned by extract_new_phrases() for the accepted
        prefixes. If no prefix was accepted and *sentence* is non-empty, the
        whole sentence is the single entry.
    """
    openai.api_key = k
    sentence_segmenter = SentenceSegmenterPromptPipeline()
    accepted = []
    for candidate in split_and_concatenate(sentence):
        # Cleared before every query -- presumably so a stored answer is not
        # reused for a different candidate (confirm against PromptPipeline).
        sentence_segmenter.clear_cached_responses()
        responses = []
        for res in sentence_segmenter.gen_responses(
            {"sentence": candidate}, LLM.ChatGPT, n=1, temperature=TEMPERATURE
        ):
            responses.extend(extract_responses(res, llm=LLM.ChatGPT))
        # Only the first reply is consulted. A missing or empty reply counts
        # as "not yes" instead of raising IndexError.
        answer = responses[0] if responses else ""
        if answer:
            answer = strip_wrapping_quotes(answer)
        if 'yes' in answer.lower():
            accepted.append(candidate)
    # Fall back to the unsplit sentence when nothing was accepted.
    if sentence and not accepted:
        accepted.append(sentence)
    return extract_new_phrases(accepted)