Skip to content

Commit 4ee1e2f

Browse files
authored
Feature/fix bare citations script (#115)
* CDL: script that checks for known citation keys without @ symbol * CDL: added bare-citation fix to Makefile * github action to check for bare citations * update and rename citation-check script to work with github actions * CDL: couple typos and fixes in citation check workflow * updated find_bare_citations.py to skip comment blocks * CDL: remove citation-fix from Makefile * CDL: switch to argparse+Path * Optimization changes for running faster * Requested change to use yield * CDL: fix return value * CDL: actually fix return value * Revert "CDL: fix return value", wrong file. This reverts commit 2675b7e. * MUCH faster citation check by simpler regex * CDL: edited citation-extraction as requested * CDL: some cleanup items * CDL: eliminate redundant var assignment * bare-citation-check: Only find line if issue exists * CDL: updating type hints * CDL: adding a comment * CDL: fixing the type hint --------- Co-authored-by: Colin Leong <--unset>
1 parent c6037ea commit 4ee1e2f

File tree

2 files changed

+124
-0
lines changed

2 files changed

+124
-0
lines changed
+21
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# GitHub Actions workflow that runs src/find_bare_citations.py on the repository.
name: Find bare citations
2+
3+
# Triggered by pull requests and pushes that target the main or master branch.
on:
4+
pull_request:
5+
branches: [ main, master ]
6+
push:
7+
branches: [ main, master ]
8+
9+
jobs:
10+
# NOTE(review): the job id is "format-code" although its only script step runs the
# citation checker; possibly copied from another workflow. Confirm before renaming,
# since other settings may refer to this id.
format-code:
11+
runs-on: ubuntu-latest
12+
13+
steps:
14+
# Check out the repository so the script and its input files are available.
- uses: actions/checkout@v4
15+
16+
# Install the Python interpreter used to run the checker.
- uses: actions/setup-python@v5
17+
with:
18+
python-version: '3.10'
19+
20+
# Passes the bibliography file and the markdown file as the script's two arguments.
# The step fails if the script exits with a non-zero status.
- name: Run checker script
21+
run: python src/find_bare_citations.py src/references.bib src/index.md

src/find_bare_citations.py

+103
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
# Find bare citation keys, e.g. shalev2022ham2pose, that are missing the leading "@" (it should read @shalev2022ham2pose), and report them.
2+
# Does this by reading the citation keys from references.bib and checking the markdown for matches.
3+
# Written with ChatGPT assistance
4+
# https://chatgpt.com/share/68cde216-5aeb-41e1-a4b2-93e0855f6b98
5+
# https://chatgpt.com/share/00b023dc-74b8-4cd1-8b78-eadf39658688
6+
# https://chatgpt.com/share/3c145c68-dc38-43f0-a2b1-27d3cd1d43f5
7+
import re
8+
import sys
9+
from pathlib import Path
10+
import argparse
11+
import timeit
12+
import uuid
13+
from typing import Generator, List, Tuple
14+
15+
16+
def extract_citation_keys(bib_file_path: Path) -> List[str]:
    """Return the citation keys declared in a BibTeX file.

    A key is the token between an entry's opening brace and the comma that
    follows it, e.g. ``shalev2022ham2pose`` in
    ``@inproceedings{shalev2022ham2pose,``.

    Args:
        bib_file_path: Path of the ``.bib`` file to read (decoded as UTF-8).

    Returns:
        The keys in file order. Duplicate keys are returned as often as they
        occur.
    """
    content = bib_file_path.read_text(encoding="utf-8")

    # The key class excludes whitespace, braces and commas so a match can never
    # run past the end of an entry. Without that restriction a comma-less entry
    # such as ``@string{jan = "January"}`` would swallow everything up to the
    # next comma, hiding the following entry's real key. Optional whitespace
    # around the key is tolerated but not captured.
    return re.findall(r"@\w+\s*\{\s*([^,\s{}]+)\s*,", content)
20+
21+
def find_bare_citations(markdown_file_path: Path, citation_keys: List[str]) -> Generator[Tuple[str, List[str]], None, None]:
    """Yield citation keys that appear in a markdown file without a leading "@".

    HTML comments and fenced code blocks are removed before searching, so keys
    mentioned only there are not reported.

    Args:
        markdown_file_path: Path of the markdown file to scan (decoded as UTF-8).
        citation_keys: Known citation keys, e.g. those read from the .bib file.

    Yields:
        ``(citation_key, lines)`` for every key with at least one bare
        occurrence, in the order the keys were given. ``lines`` holds the full
        text of the containing line once per bare occurrence, so a line with
        two bare occurrences is listed twice.
    """
    content = markdown_file_path.read_text(encoding="utf-8")

    # Remove HTML comments. regex from https://stackoverflow.com/a/28208465
    content = re.sub(r"<!--.*?-->", "", content, flags=re.DOTALL)

    # Remove Markdown code blocks, regex from https://stackoverflow.com/a/64116935
    content = re.sub(r"```[^`]*```", "", content)

    # dict.fromkeys drops duplicate keys while keeping the first-seen order, so
    # a key listed twice is neither searched nor reported twice.
    for citation_key in dict.fromkeys(citation_keys):
        # The lookbehind rejects occurrences preceded by "@" (a proper citation)
        # or by a word character; the lookahead rejects occurrences followed by
        # a word character. Together they stop a key from matching inside a
        # longer key or word, e.g. key "ham2pose" inside "@shalev2022ham2pose".
        # Trade-off: "_" counts as a word character, so a bare key wrapped
        # directly in underscores is not reported.
        key_pattern = re.compile(r"(?<![@\w])" + re.escape(citation_key) + r"(?!\w)")

        matches = []
        for match in key_pattern.finditer(content):
            start_index = match.start()
            # Pull out the whole line containing the bare key.
            line_start = content.rfind("\n", 0, start_index) + 1
            line_end = content.find("\n", start_index)
            if line_end == -1:
                # The key sits on the last line, which has no trailing newline.
                line_end = len(content)
            matches.append(content[line_start:line_end])

        if matches:
            yield citation_key, matches
51+
52+
53+
if __name__ == "__main__":
    # Command-line entry point: read the citation keys from the .bib file,
    # report every bare use of them in the markdown file, and exit with
    # status 1 when at least one key was reported.
    cli = argparse.ArgumentParser()
    for positional in ("bib_file_path", "markdown_file_path"):
        cli.add_argument(positional, type=Path)
    args = cli.parse_args()

    print(f"Parsing {args.bib_file_path} for citations")
    extraction_started = timeit.default_timer()
    citation_keys = extract_citation_keys(args.bib_file_path)
    extraction_seconds = timeit.default_timer() - extraction_started
    print(f"Finding citations took {extraction_seconds} seconds")

    print(f"Bibliography had {len(citation_keys)} citations")

    print(f"Beginning bare-citations check: checking {args.markdown_file_path}")

    check_started = timeit.default_timer()

    print("Found the following lines with bare citations:")
    print()

    # find_bare_citations is a generator, so the only way to learn whether it
    # produced anything is to count the keys while consuming it.
    reported_keys = 0
    for key, bare_lines in find_bare_citations(args.markdown_file_path, citation_keys):
        reported_keys += 1
        print(f"Citation key: {key}: {len(bare_lines)} bare citations")

        for index, line in enumerate(bare_lines):
            print(f"{index}: {line}")

        print()

    check_seconds = timeit.default_timer() - check_started
    print(f"Bare-citation check complete after ~{check_seconds} seconds")

    if reported_keys:
        sys.exit(1)  # exit with an error
102+
103+

0 commit comments

Comments
 (0)