Skip to content

Commit c319433

Browse files
committed
Refactor type assertions to use isinstance, and add a note in the README about the heuristic nature of hierarchy building
1 parent 8cb030d commit c319433

6 files changed

Lines changed: 19 additions & 16 deletions

File tree

README.md

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,9 @@
99

1010
A parser for extracting headings and hierarchical structure from Markdown files.
1111

12+
> [!IMPORTANT]
13+
> This parser uses several heuristic rules to build hierarchies from diverse heading formats. Results may vary depending on document structure and formatting. Please review the [How Hierarchy is Built](#how-hierarchy-is-built) section before use to understand the parser's capabilities and limitations for your specific use case.
14+
1215
## Features
1316

1417
- Parse multiple heading formats (hash `#`, asterisk `**`, inline with colon, all-caps)

src/headhunter/hierarchy.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -246,7 +246,7 @@ def _update_heading_stack(
246246
"""
247247
while heading_stack and heading_stack[-1][0] >= level:
248248
heading_stack.pop()
249-
assert type(token.metadata) is models.HeadingMetadata # for mypy
249+
assert isinstance(token.metadata, models.HeadingMetadata) # for mypy
250250
heading_stack.append((level, token.content, token.metadata))
251251

252252
def _should_pop_inline_heading(

src/headhunter/matcher.py

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -399,7 +399,7 @@ def _split_content_token(
399399

400400
# Create post-content tokens
401401
# Special handling for inline colon headings
402-
assert type(heading_token.metadata) is models.HeadingMetadata # for mypy
402+
assert isinstance(heading_token.metadata, models.HeadingMetadata) # for mypy
403403

404404
if heading_token.metadata.is_inline:
405405
if after_text:
@@ -526,7 +526,7 @@ def _find_and_extract_heading(
526526
heading_position_in_split = i
527527
break
528528

529-
assert type(heading_position_in_split) is int # for mypy
529+
assert isinstance(heading_position_in_split, int) # for mypy
530530
extracted_heading_idx = original_idx + heading_position_in_split
531531
heading_line_number = split_tokens[heading_position_in_split].line_number
532532

@@ -605,7 +605,7 @@ def match_headings(
605605
match_token, match_idx = exact_match_result
606606
last_matched_token_index = match_idx
607607

608-
assert type(match_token.metadata) is models.HeadingMetadata # for mypy
608+
assert isinstance(match_token.metadata, models.HeadingMetadata) # for mypy
609609

610610
matched_headings.append(
611611
{
@@ -657,7 +657,7 @@ def match_headings(
657657
last_matched_token_index = extracted_heading_idx
658658

659659
extracted_token = current_tokens[extracted_heading_idx]
660-
assert type(extracted_token.metadata) is models.HeadingMetadata # for mypy
660+
assert isinstance(extracted_token.metadata, models.HeadingMetadata) # for mypy
661661
heading_signature = extracted_token.metadata.signature
662662

663663
matched_headings.append(

src/headhunter/models.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -575,15 +575,15 @@ def match_headings(
575575
updated_documents.append(updated_doc)
576576

577577
match_pct = updated_doc.metadata["match_percentage"]
578-
assert type(match_pct) is float
578+
assert isinstance(match_pct, float)
579579
match_percentages.append(float(match_pct))
580580

581581
matched = updated_doc.metadata["matched_headings"]
582-
assert type(matched) is list
582+
assert isinstance(matched, list)
583583
all_matched_headings.extend(matched)
584584

585585
missing = updated_doc.metadata["missing_headings"]
586-
assert type(missing) is list
586+
assert isinstance(missing, list)
587587
all_missing_headings.extend(missing)
588588

589589
doc_id = str(updated_doc.metadata["id"])

src/headhunter/output.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -77,7 +77,7 @@ def to_dict(
7777
}
7878

7979
parent = _pop_stack_to_parent_level(stack, ctx.level)
80-
assert type(parent["sections"]) is list # for mypy
80+
assert isinstance(parent["sections"], list) # for mypy
8181
parent["sections"].append(section)
8282
stack.append((ctx.level, section))
8383

@@ -91,7 +91,7 @@ def to_dict(
9191
"line_number": token.line_number,
9292
}
9393

94-
assert type(parent["sections"]) is list # for mypy
94+
assert isinstance(parent["sections"], list) # for mypy
9595
parent["sections"].append(content_item)
9696

9797
return root

tests/test_api.py

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -24,7 +24,7 @@ def test_process_text(
2424
)
2525
actual_output = parsed_text.to_dict()
2626

27-
assert type(parsed_text) is ParsedText
27+
assert isinstance(parsed_text, ParsedText)
2828
assert actual_output == sample_mixed_json
2929

3030

@@ -57,7 +57,7 @@ def test_process_batch_df(
5757
json_files = parsed_batch.to_json(str(json_dir))
5858
tree_files = parsed_batch.to_tree(str(tree_dir))
5959

60-
assert type(parsed_batch) is ParsedBatch
60+
assert isinstance(parsed_batch, ParsedBatch)
6161
assert actual_dataframe.equals(sample_dataframe_parsed)
6262
assert (
6363
"Unknown custom configuration parameter(s) will be ignored: 'random_param'. "
@@ -104,7 +104,7 @@ def test_process_text_with_matcher(
104104

105105
actual_output = parsed_text.to_dict()
106106

107-
assert type(parsed_text) is ParsedText
107+
assert isinstance(parsed_text, ParsedText)
108108
assert actual_output == sample_match_json
109109

110110

@@ -139,7 +139,7 @@ def test_process_batch_df_with_matcher(
139139
# Reorder expected columns to match actual output for comparison
140140
actual_output = actual_output[sample_dataframe_match_parsed.columns]
141141

142-
assert type(parsed_batch) is ParsedBatch
142+
assert isinstance(parsed_batch, ParsedBatch)
143143
assert actual_output.equals(sample_dataframe_match_parsed)
144144

145145

@@ -219,7 +219,7 @@ def test_process_structured_df(
219219
)
220220
actual_dataframe = parsed_batch.to_dataframe()
221221

222-
assert type(parsed_batch) is ParsedBatch
222+
assert isinstance(parsed_batch, ParsedBatch)
223223
assert len(parsed_batch.documents) == len(sample_structured_dataframe)
224224
assert actual_dataframe.equals(sample_structured_parsed)
225225

@@ -240,7 +240,7 @@ def test_process_structured_df_auto_detect_columns(
240240
)
241241
actual_dataframe = parsed_batch.to_dataframe()
242242

243-
assert type(parsed_batch) is ParsedBatch
243+
assert isinstance(parsed_batch, ParsedBatch)
244244
assert len(parsed_batch.documents) == len(sample_structured_dataframe)
245245
assert actual_dataframe.equals(sample_structured_parsed)
246246

0 commit comments

Comments
 (0)