Skip to content

Commit 8e12b29

Browse files
committed
Add new document fixtures and update tests for batch processing to improve coverage.
1 parent b4dee39 commit 8e12b29

15 files changed

Lines changed: 608 additions & 4 deletions

File tree

src/headhunter/output.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,7 @@ def to_json_file(
118118

119119
with open(filepath, "w", encoding="utf-8") as f:
120120
json.dump(hierarchical_data, f, indent=indent, ensure_ascii=False)
121+
f.write("\n")
121122

122123
logger.debug(f"Exported JSON to {filepath}")
123124
return str(filepath)
@@ -204,7 +205,7 @@ def to_tree_string(
204205

205206
lines.append(f"{prefix}{label}")
206207

207-
return "\n".join(lines)
208+
return "\n".join(lines) + "\n"
208209

209210

210211
def _ensure_output_directory(output_dir: str | pathlib.Path) -> pathlib.Path:

tests/conftest.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,3 +51,47 @@ def sample_dataframe_parsed() -> pd.DataFrame:
5151
df["parent_types"] = df["parent_types"].apply(ast.literal_eval)
5252

5353
return df
54+
55+
56+
@pytest.fixture
def sample_dataframe_match() -> pd.DataFrame:
    """Load the raw matcher input table used by the batch-processing tests.

    Returns:
        The contents of ``fixtures/sample_data_match.csv`` (resolved relative
        to this file) as read by ``pd.read_csv`` with default options.
    """
    fixtures_dir = pathlib.Path(__file__).parent / "fixtures"
    csv_path = fixtures_dir / "sample_data_match.csv"
    return pd.read_csv(csv_path)
62+
63+
64+
@pytest.fixture
def sample_dataframe_match_parsed() -> pd.DataFrame:
    """Load the expected parsed table for ``sample_dataframe_match``.

    Returns:
        The contents of ``fixtures/sample_data_match_parsed.csv`` with the
        list-valued columns converted from their string form back into
        Python objects.
    """
    fixtures_dir = pathlib.Path(__file__).parent / "fixtures"
    parsed = pd.read_csv(fixtures_dir / "sample_data_match_parsed.csv")

    # CSV stores these columns as Python-literal strings; evaluate each cell
    # back into the literal it represents.
    for column in (
        "parents",
        "parent_types",
        "matched_headings",
        "missing_headings",
    ):
        parsed[column] = parsed[column].apply(ast.literal_eval)

    return parsed
77+
78+
79+
@pytest.fixture
def expected_json_files() -> dict[str, dict]:
    """Load the expected JSON documents for the batch-processing tests.

    Every ``*.json`` file in ``fixtures/expected_json`` (resolved relative to
    this file) is parsed and stored under its file name. Files are visited in
    sorted order so the resulting dict has a deterministic key order.

    Returns:
        Mapping of file name (e.g. ``"foo.json"``) to the parsed JSON content.
    """
    json_dir = pathlib.Path(__file__).parent / "fixtures" / "expected_json"
    result = {}
    for json_file in sorted(json_dir.glob("*.json")):
        # Decode explicitly as UTF-8 rather than relying on the platform
        # default encoding, which differs between operating systems and
        # would break on any non-ASCII text in the fixtures.
        with open(json_file, encoding="utf-8") as f:
            result[json_file.name] = json.load(f)
    return result
88+
89+
90+
@pytest.fixture
def expected_tree_files() -> dict[str, str]:
    """Load the expected tree renderings for the batch-processing tests.

    Every ``*.txt`` file in ``fixtures/expected_tree`` (resolved relative to
    this file) is read in full and stored under its file name. Files are
    visited in sorted order so the resulting dict has a deterministic key
    order.

    Returns:
        Mapping of file name (e.g. ``"foo.txt"``) to the file's text content.
    """
    tree_dir = pathlib.Path(__file__).parent / "fixtures" / "expected_tree"
    result = {}
    for tree_file in sorted(tree_dir.glob("*.txt")):
        # Decode explicitly as UTF-8 so the fixture text is read the same way
        # on every platform instead of using the locale's default encoding.
        result[tree_file.name] = tree_file.read_text(encoding="utf-8")
    return result
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
{
2+
"row_index": 0,
3+
"id": "doc1",
4+
"category": "A",
5+
"priority": 1,
6+
"sections": [
7+
{
8+
"type": "heading",
9+
"text": "Document 1",
10+
"level": 1,
11+
"line_number": 1,
12+
"metadata": {
13+
"marker": "#",
14+
"marker_count": 1,
15+
"case": "title_case",
16+
"is_inline": false,
17+
"is_extracted": false,
18+
"extraction_position": null,
19+
"signature": "#1"
20+
},
21+
"sections": [
22+
{
23+
"type": "content",
24+
"text": "This is the first document with some content.",
25+
"level": 2,
26+
"line_number": 3
27+
},
28+
{
29+
"type": "heading",
30+
"text": "Section 1.1",
31+
"level": 2,
32+
"line_number": 5,
33+
"metadata": {
34+
"marker": "#",
35+
"marker_count": 2,
36+
"case": "title_case",
37+
"is_inline": false,
38+
"is_extracted": false,
39+
"extraction_position": null,
40+
"signature": "#2"
41+
},
42+
"sections": [
43+
{
44+
"type": "content",
45+
"text": "More details here.",
46+
"level": 3,
47+
"line_number": 7
48+
}
49+
]
50+
}
51+
]
52+
}
53+
]
54+
}
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
{
2+
"row_index": 1,
3+
"id": "doc2",
4+
"category": "B",
5+
"priority": 2,
6+
"sections": [
7+
{
8+
"type": "heading",
9+
"text": "Document 2",
10+
"level": 1,
11+
"line_number": 1,
12+
"metadata": {
13+
"marker": "#",
14+
"marker_count": 1,
15+
"case": "title_case",
16+
"is_inline": false,
17+
"is_extracted": false,
18+
"extraction_position": null,
19+
"signature": "#1"
20+
},
21+
"sections": [
22+
{
23+
"type": "heading",
24+
"text": "Overview",
25+
"level": 2,
26+
"line_number": 3,
27+
"metadata": {
28+
"marker": "#",
29+
"marker_count": 2,
30+
"case": "title_case",
31+
"is_inline": false,
32+
"is_extracted": false,
33+
"extraction_position": null,
34+
"signature": "#2"
35+
},
36+
"sections": [
37+
{
38+
"type": "content",
39+
"text": "Second document overview.",
40+
"level": 3,
41+
"line_number": 5
42+
},
43+
{
44+
"type": "heading",
45+
"text": "Details",
46+
"level": 3,
47+
"line_number": 7,
48+
"metadata": {
49+
"marker": "#",
50+
"marker_count": 3,
51+
"case": "title_case",
52+
"is_inline": false,
53+
"is_extracted": false,
54+
"extraction_position": null,
55+
"signature": "#3"
56+
},
57+
"sections": [
58+
{
59+
"type": "content",
60+
"text": "Nested content.",
61+
"level": 4,
62+
"line_number": 9
63+
}
64+
]
65+
}
66+
]
67+
}
68+
]
69+
}
70+
]
71+
}
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
{
2+
"row_index": 2,
3+
"id": "doc3",
4+
"category": "A",
5+
"priority": 3,
6+
"sections": [
7+
{
8+
"type": "heading",
9+
"text": "Document number",
10+
"level": 1,
11+
"line_number": 1,
12+
"metadata": {
13+
"marker": "*",
14+
"marker_count": 2,
15+
"case": "sentence_case",
16+
"is_inline": true,
17+
"is_extracted": false,
18+
"extraction_position": null,
19+
"signature": "*2-inline"
20+
},
21+
"sections": [
22+
{
23+
"type": "content",
24+
"text": "3",
25+
"level": 2,
26+
"line_number": 1
27+
}
28+
]
29+
},
30+
{
31+
"type": "heading",
32+
"text": "Document type",
33+
"level": 1,
34+
"line_number": 3,
35+
"metadata": {
36+
"marker": "*",
37+
"marker_count": 2,
38+
"case": "title_case",
39+
"is_inline": true,
40+
"is_extracted": false,
41+
"extraction_position": null,
42+
"signature": "*2-inline"
43+
},
44+
"sections": [
45+
{
46+
"type": "content",
47+
"text": "Markdown",
48+
"level": 2,
49+
"line_number": 3
50+
}
51+
]
52+
},
53+
{
54+
"type": "content",
55+
"text": "Simple document with minimal structure.",
56+
"level": 1,
57+
"line_number": 5
58+
}
59+
]
60+
}
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
{
2+
"row_index": 3,
3+
"id": "doc4",
4+
"category": "C",
5+
"priority": 1,
6+
"sections": [
7+
{
8+
"type": "heading",
9+
"text": "Document 4",
10+
"level": 1,
11+
"line_number": 1,
12+
"metadata": {
13+
"marker": "#",
14+
"marker_count": 1,
15+
"case": "title_case",
16+
"is_inline": false,
17+
"is_extracted": false,
18+
"extraction_position": null,
19+
"signature": "#1"
20+
},
21+
"sections": [
22+
{
23+
"type": "heading",
24+
"text": "Introduction",
25+
"level": 2,
26+
"line_number": 3,
27+
"metadata": {
28+
"marker": "#",
29+
"marker_count": 2,
30+
"case": "title_case",
31+
"is_inline": false,
32+
"is_extracted": false,
33+
"extraction_position": null,
34+
"signature": "#2"
35+
},
36+
"sections": [
37+
{
38+
"type": "content",
39+
"text": "Fourth document.",
40+
"level": 3,
41+
"line_number": 5
42+
}
43+
]
44+
},
45+
{
46+
"type": "heading",
47+
"text": "Conclusion",
48+
"level": 2,
49+
"line_number": 7,
50+
"metadata": {
51+
"marker": "#",
52+
"marker_count": 2,
53+
"case": "title_case",
54+
"is_inline": false,
55+
"is_extracted": false,
56+
"extraction_position": null,
57+
"signature": "#2"
58+
},
59+
"sections": [
60+
{
61+
"type": "content",
62+
"text": "Final thoughts.",
63+
"level": 3,
64+
"line_number": 9
65+
}
66+
]
67+
}
68+
]
69+
}
70+
]
71+
}

0 commit comments

Comments
 (0)