Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit 43b24fd

Browse files
Authored on Nov 8, 2022
fix: strip whitespaces safely from FARMReader's answers (#3526)
* Remove the unconditional `.strip()` call
* Check for the right-side offset
* Return the whitespace-cleaned answer
* Use `lstrip`, not `rstrip` :D
* Remove the redundant `int` cast
* Introduce `left_offset`
* Slightly refactor the reader fixture
* Extend `test_output`
1 parent e6b7109 commit 43b24fd

File tree

4 files changed

+60
-80
lines changed

4 files changed

+60
-80
lines changed
 

‎haystack/modeling/model/predictions.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -201,13 +201,18 @@ def _span_to_string(self, token_offsets: List[int], clear_text: str) -> Tuple[st
201201
# final_text can be an empty string if start_t points to the very final token of the passage
202202
# final_text can be a whitespace if there is a whitespace token in the text, e.g.,
203203
# if the original text contained multiple consecutive whitespaces
204-
if len(final_text.strip()) > 0:
205-
final_text = final_text.strip()
206-
else:
204+
cleaned_final_text = final_text.strip()
205+
if not cleaned_final_text:
207206
return "", 0, 0
208-
end_ch = int(start_ch + len(final_text))
209207

210-
return final_text, start_ch, end_ch
208+
# Adjust the offsets in case of whitespace at the beginning of the answer
209+
left_offset = len(final_text) - len(final_text.lstrip())
210+
if left_offset:
211+
start_ch = start_ch + left_offset
212+
213+
end_ch = start_ch + len(cleaned_final_text)
214+
215+
return cleaned_final_text, start_ch, end_ch
211216

212217
def to_doc_level(self, start: int, end: int):
213218
"""

‎test/conftest.py

Lines changed: 0 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -720,40 +720,6 @@ def indexing_document_classifier():
720720
)
721721

722722

723-
# TODO Fix bug in test_no_answer_output when using
724-
# @pytest.fixture(params=["farm", "transformers"])
725-
@pytest.fixture(params=["farm"])
726-
def no_answer_reader(request):
727-
if request.param == "farm":
728-
return FARMReader(
729-
model_name_or_path="deepset/bert-medium-squad2-distilled",
730-
use_gpu=False,
731-
top_k_per_sample=5,
732-
no_ans_boost=0,
733-
return_no_answer=True,
734-
num_processes=0,
735-
)
736-
if request.param == "transformers":
737-
return TransformersReader(
738-
model_name_or_path="deepset/bert-medium-squad2-distilled",
739-
tokenizer="deepset/bert-medium-squad2-distilled",
740-
use_gpu=-1,
741-
top_k_per_candidate=5,
742-
)
743-
744-
745-
@pytest.fixture
746-
def prediction(reader, docs):
747-
prediction = reader.predict(query="Who lives in Berlin?", documents=docs, top_k=5)
748-
return prediction
749-
750-
751-
@pytest.fixture
752-
def no_answer_prediction(no_answer_reader, docs):
753-
prediction = no_answer_reader.predict(query="What is the meaning of life?", documents=docs, top_k=5)
754-
return prediction
755-
756-
757723
@pytest.fixture(params=["es_filter_only", "elasticsearch", "dpr", "embedding", "tfidf", "table_text_retriever"])
758724
def retriever(request, document_store):
759725
return get_retriever(request.param, document_store)

‎test/modeling/test_question_answering.py

Lines changed: 19 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,21 @@
77
from haystack.modeling.data_handler.inputs import QAInput, Question
88

99

10+
DOC_TEXT = """Twilight Princess was released to universal critical acclaim and commercial success. \
11+
It received perfect scores from major publications such as 1UP.com, Computer and Video Games, \
12+
Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators \
13+
GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii \
14+
version and scores of 95% and 96 for the GameCube version. GameTrailers in their review called \
15+
it one of the greatest games ever created."""
16+
17+
1018
@pytest.fixture()
1119
def span_inference_result(bert_base_squad2, caplog=None):
1220
if caplog:
1321
caplog.set_level(logging.CRITICAL)
1422
obj_input = [
1523
QAInput(
16-
doc_text="Twilight Princess was released to universal critical acclaim and commercial success. It received perfect scores from major publications such as 1UP.com, Computer and Video Games, Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii version and scores of 95% and 96 for the GameCube version. GameTrailers in their review called it one of the greatest games ever created.",
17-
questions=Question("Who counted the game among the best ever made?", uid="best_id_ever"),
24+
doc_text=DOC_TEXT, questions=Question("Who counted the game among the best ever made?", uid="best_id_ever")
1825
)
1926
]
2027
result = bert_base_squad2.inference_from_objects(obj_input, return_json=False)[0]
@@ -27,7 +34,13 @@ def no_answer_inference_result(bert_base_squad2, caplog=None):
2734
caplog.set_level(logging.CRITICAL)
2835
obj_input = [
2936
QAInput(
30-
doc_text='The majority of the forest is contained within Brazil, with 60% of the rainforest, followed by Peru with 13%, Colombia with 10%, and with minor amounts in Venezuela, Ecuador, Bolivia, Guyana, Suriname and French Guiana. States or departments in four nations contain "Amazonas" in their names. The Amazon represents over half of the planet\'s remaining rainforests, and comprises the largest and most biodiverse tract of tropical rainforest in the world, with an estimated 390 billion individual trees divided into 16,000 species.',
37+
doc_text="""\
38+
The majority of the forest is contained within Brazil, with 60% of the rainforest, followed by
39+
Peru with 13%, Colombia with 10%, and with minor amounts in Venezuela, Ecuador, Bolivia, Guyana,
40+
Suriname and French Guiana. States or departments in four nations contain "Amazonas" in their names.
41+
The Amazon represents over half of the planet\'s remaining rainforests, and comprises the largest
42+
and most biodiverse tract of tropical rainforest in the world, with an estimated 390 billion individual
43+
trees divided into 16,000 species.""",
3144
questions=Question(
3245
"The Amazon represents less than half of the planets remaining what?", uid="best_id_ever"
3346
),
@@ -38,17 +51,9 @@ def no_answer_inference_result(bert_base_squad2, caplog=None):
3851

3952

4053
def test_inference_different_inputs(bert_base_squad2):
41-
qa_format_1 = [
42-
{
43-
"questions": ["Who counted the game among the best ever made?"],
44-
"text": "Twilight Princess was released to universal critical acclaim and commercial success. It received perfect scores from major publications such as 1UP.com, Computer and Video Games, Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii version and scores of 95% and 96 for the GameCube version. GameTrailers in their review called it one of the greatest games ever created.",
45-
}
46-
]
54+
qa_format_1 = [{"questions": ["Who counted the game among the best ever made?"], "text": DOC_TEXT}]
4755
q = Question(text="Who counted the game among the best ever made?")
48-
qa_format_2 = QAInput(
49-
questions=[q],
50-
doc_text="Twilight Princess was released to universal critical acclaim and commercial success. It received perfect scores from major publications such as 1UP.com, Computer and Video Games, Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii version and scores of 95% and 96 for the GameCube version. GameTrailers in their review called it one of the greatest games ever created.",
51-
)
56+
qa_format_2 = QAInput(questions=[q], doc_text=DOC_TEXT)
5257

5358
result1 = bert_base_squad2.inference_from_dicts(dicts=qa_format_1)
5459
result2 = bert_base_squad2.inference_from_objects(objects=[qa_format_2])
@@ -60,8 +65,7 @@ def test_span_inference_result_ranking_by_confidence(bert_base_squad2, caplog=No
6065
caplog.set_level(logging.CRITICAL)
6166
obj_input = [
6267
QAInput(
63-
doc_text="Twilight Princess was released to universal critical acclaim and commercial success. It received perfect scores from major publications such as 1UP.com, Computer and Video Games, Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii version and scores of 95% and 96 for the GameCube version. GameTrailers in their review called it one of the greatest games ever created.",
64-
questions=Question("Who counted the game among the best ever made?", uid="best_id_ever"),
68+
doc_text=DOC_TEXT, questions=Question("Who counted the game among the best ever made?", uid="best_id_ever")
6569
)
6670
]
6771

‎test/nodes/test_reader.py

Lines changed: 31 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -7,22 +7,47 @@
77

88
from haystack.schema import Document, Answer
99
from haystack.nodes.reader.base import BaseReader
10-
from haystack.nodes.reader.farm import FARMReader
10+
from haystack.nodes import FARMReader, TransformersReader
11+
12+
13+
# TODO Fix bug in test_no_answer_output when using
14+
# @pytest.fixture(params=["farm", "transformers"])
15+
@pytest.fixture(params=["farm"])
16+
def no_answer_reader(request):
17+
if request.param == "farm":
18+
return FARMReader(
19+
model_name_or_path="deepset/bert-medium-squad2-distilled",
20+
use_gpu=False,
21+
top_k_per_sample=5,
22+
no_ans_boost=0,
23+
return_no_answer=True,
24+
num_processes=0,
25+
)
26+
if request.param == "transformers":
27+
return TransformersReader(
28+
model_name_or_path="deepset/bert-medium-squad2-distilled",
29+
tokenizer="deepset/bert-medium-squad2-distilled",
30+
use_gpu=-1,
31+
top_k_per_candidate=5,
32+
)
1133

1234

1335
def test_reader_basic(reader):
1436
assert reader is not None
1537
assert isinstance(reader, BaseReader)
1638

1739

18-
def test_output(prediction):
40+
def test_output(reader, docs):
41+
prediction = reader.predict(query="Who lives in Berlin?", documents=docs, top_k=5)
1942
assert prediction is not None
2043
assert prediction["query"] == "Who lives in Berlin?"
2144
assert prediction["answers"][0].answer == "Carla"
2245
assert prediction["answers"][0].offsets_in_context[0].start == 11
2346
assert prediction["answers"][0].offsets_in_context[0].end == 16
24-
assert prediction["answers"][0].score <= 1
25-
assert prediction["answers"][0].score >= 0
47+
assert prediction["answers"][0].offsets_in_document[0].start == 11
48+
assert prediction["answers"][0].offsets_in_document[0].end == 16
49+
assert prediction["answers"][0].type == "extractive"
50+
assert 0 <= prediction["answers"][0].score <= 1
2651
assert prediction["answers"][0].context == "My name is Carla and I live in Berlin"
2752
assert len(prediction["answers"]) == 5
2853

@@ -80,7 +105,8 @@ def test_output_batch_multiple_queries_multiple_doc_lists(reader, docs):
80105

81106

82107
@pytest.mark.integration
83-
def test_no_answer_output(no_answer_prediction):
108+
def test_no_answer_output(no_answer_reader, docs):
109+
no_answer_prediction = no_answer_reader.predict(query="What is the meaning of life?", documents=docs, top_k=5)
84110
assert no_answer_prediction is not None
85111
assert no_answer_prediction["query"] == "What is the meaning of life?"
86112
assert math.isclose(no_answer_prediction["no_ans_gap"], 0.9094805717468262, rel_tol=0.0001)
@@ -96,34 +122,13 @@ def test_no_answer_output(no_answer_prediction):
96122
assert len(no_answer_prediction["answers"]) == 5
97123

98124

99-
# TODO Directly compare farm and transformers reader outputs
100-
# TODO checks to see that model is responsive to input arguments e.g. context_window_size - topk
101-
102-
103-
@pytest.mark.integration
104-
def test_prediction_attributes(prediction):
105-
# TODO FARM's prediction also has no_ans_gap
106-
attributes_gold = ["query", "answers"]
107-
for ag in attributes_gold:
108-
assert ag in prediction
109-
110-
111125
@pytest.mark.integration
112126
def test_model_download_options():
113127
# download disabled and model is not cached locally
114128
with pytest.raises(OSError):
115129
impossible_reader = FARMReader("mfeb/albert-xxlarge-v2-squad2", local_files_only=True, num_processes=0)
116130

117131

118-
def test_answer_attributes(prediction):
119-
# TODO Transformers answer also has meta key
120-
answer = prediction["answers"][0]
121-
assert type(answer) == Answer
122-
attributes_gold = ["answer", "score", "context", "offsets_in_context", "offsets_in_document", "type"]
123-
for ag in attributes_gold:
124-
assert getattr(answer, ag, None) is not None
125-
126-
127132
@pytest.mark.integration
128133
@pytest.mark.parametrize("reader", ["farm"], indirect=True)
129134
@pytest.mark.parametrize("window_size", [10, 15, 20])

0 commit comments

Comments (0)
Please sign in to comment.