
Commit 73a8b4e

Refactor code structure for improved readability and maintainability
1 parent ce14315 commit 73a8b4e

6 files changed: +375 −23 lines
Lines changed: 60 additions & 0 deletions

@@ -0,0 +1,60 @@
name: Publish Python 🐍 distribution to PyPI on version bump

on:
  push:
    branches: [main]
    paths:
      - 'pyproject.toml'

jobs:
  publish-to-pypi:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Install uv
        run: |
          pip install uv

      - name: Install build dependencies with uv
        run: |
          # --system installs into the runner's interpreter; uv otherwise expects a virtualenv
          uv pip install --system build toml requests packaging

      - name: Get current version from pyproject.toml
        id: get_version
        run: |
          python -c "import toml; print(toml.load('pyproject.toml')['project']['version'])" > version.txt
          echo "version=$(cat version.txt)" >> $GITHUB_OUTPUT

      - name: Get latest version from PyPI
        id: get_latest
        run: |
          PKG_NAME=$(python -c "import toml; print(toml.load('pyproject.toml')['project']['name'])")
          # Read stdin once, then fall back to 0.0.0 if the package has never been published
          curl -s https://pypi.org/pypi/$PKG_NAME/json | \
            python -c "import sys, json; data = sys.stdin.read(); print(json.loads(data).get('info', {}).get('version', '0.0.0') if data.strip() else '0.0.0')" > latest.txt
          echo "latest=$(cat latest.txt)" >> $GITHUB_OUTPUT

      - name: Compare versions and set publish flag
        id: check_version
        run: |
          # Environment variables must precede the command; write the flag straight to GITHUB_OUTPUT
          VERSION=${{ steps.get_version.outputs.version }} \
          LATEST=${{ steps.get_latest.outputs.latest }} \
          python -c "from packaging.version import parse; import os; print('publish=' + str(parse(os.environ['VERSION']) > parse(os.environ['LATEST'])))" >> $GITHUB_OUTPUT

      - name: Build package
        if: steps.check_version.outputs.publish == 'True'
        run: python -m build

      - name: Publish to PyPI
        if: steps.check_version.outputs.publish == 'True'
        uses: pypa/gh-action-pypi-publish@release/v1
        with:
          user: ${{ secrets.PYPI_USERNAME }}
          password: ${{ secrets.PYPI_PASSWORD }}
          # Alternatively, use 'user: __token__' and 'password: ${{ secrets.PYPI_API_TOKEN }}' if using an API token
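The three version steps above (get_version, get_latest, check_version) can be sanity-checked locally before pushing a bump. The sketch below mirrors their logic in plain Python; it is illustrative only, assumes Python 3.11+ (for tomllib) plus the packaging package, and none of these helper names exist in the repository.

# Local sanity check mirroring the workflow's version-bump logic; illustrative only.
import json
import tomllib
import urllib.error
import urllib.request

from packaging.version import parse


def local_version(pyproject_path: str = "pyproject.toml") -> str:
    """Read [project].version from pyproject.toml."""
    with open(pyproject_path, "rb") as f:
        return tomllib.load(f)["project"]["version"]


def pypi_version(package: str) -> str:
    """Return the latest published version, or '0.0.0' if the package is unknown."""
    url = f"https://pypi.org/pypi/{package}/json"
    try:
        with urllib.request.urlopen(url, timeout=10) as resp:
            return json.load(resp)["info"]["version"]
    except urllib.error.HTTPError:
        return "0.0.0"


if __name__ == "__main__":
    with open("pyproject.toml", "rb") as f:
        name = tomllib.load(f)["project"]["name"]
    print("publish=" + str(parse(local_version()) > parse(pypi_version(name))))

Run from the repository root, it prints publish=True only when the local version is strictly greater than the one on PyPI, which is the same condition the Build and Publish steps gate on.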

.gitignore

Lines changed: 4 additions & 1 deletion

@@ -4,4 +4,7 @@ analysis_output/
 dev/
 analysis_ba_daniel_grundlagen2/
 analysis_ba_daniel_grundlagen*/**
-output/
+output/
+ENHANCEMENTS.md
+CLAUDE.md
+ProjectPlan.md

pyproject.toml

Lines changed: 1 addition & 0 deletions

@@ -41,6 +41,7 @@ build-backend = "hatchling.build"
 
 [dependency-groups]
 dev = [
+    "ipykernel>=6.30.1",
     "marimo>=0.14.16",
     "pdf2text>=1.0.0",
 ]

src/veritascribe/config.py

Lines changed: 16 additions & 8 deletions

@@ -136,21 +136,29 @@ def format_model_name(self, model_name: Optional[str] = None) -> str:
         return model
 
     def get_provider_specific_max_tokens(self) -> int:
-        """Get provider-specific max token limits optimized for each provider."""
-        provider_token_limits = {
-            "openai": min(self.max_tokens, 4000),  # OpenAI models generally handle larger contexts well
-            "openrouter": min(self.max_tokens, 3000),  # More conservative for free/cheaper models
-            "anthropic": min(self.max_tokens, 4000),  # Claude handles large contexts well
-            "custom": min(self.max_tokens, 4000)  # Conservative for unknown endpoints
+        """Get provider-specific max token limits, respecting user configuration."""
+        # Use user's max_tokens as the primary value
+        user_max_tokens = self.max_tokens
+
+        # Define reasonable upper bounds per provider (only used as safety caps)
+        provider_max_caps = {
+            "openai": 8000,      # OpenAI models generally handle larger contexts well
+            "openrouter": 8000,  # More conservative for free/cheaper models
+            "anthropic": 4000,   # Claude handles large contexts well
+            "custom": 8000       # Conservative for unknown endpoints
         }
 
+        # Apply provider-specific cap only if user's setting exceeds it
+        provider_cap = provider_max_caps.get(self.llm_provider, 8000)
+        base_tokens = min(user_max_tokens, provider_cap)
+
         # For specific known problematic models, use even lower limits
         formatted_model = self.format_model_name()
         if "free" in formatted_model.lower() or "air" in formatted_model.lower():
             # Free models often have quality issues with large contexts
-            return min(provider_token_limits.get(self.llm_provider, self.max_tokens), 2000)
+            return min(base_tokens, 8000)
 
-        return provider_token_limits.get(self.llm_provider, self.max_tokens)
+        return base_tokens
 
 
 # Provider-specific model configurations
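For review, the new rule reduces to one min() over the user's setting and a per-provider ceiling. A minimal standalone sketch, assuming only the caps shown in the diff (the helper name is illustrative and not part of config.py):

# Standalone sketch of the capping rule in get_provider_specific_max_tokens; illustrative only.
PROVIDER_MAX_CAPS = {"openai": 8000, "openrouter": 8000, "anthropic": 4000, "custom": 8000}

def capped_max_tokens(user_max_tokens: int, provider: str) -> int:
    """Respect the user's max_tokens unless it exceeds the provider's safety cap."""
    return min(user_max_tokens, PROVIDER_MAX_CAPS.get(provider, 8000))

assert capped_max_tokens(2000, "anthropic") == 2000    # a user setting below the cap passes through
assert capped_max_tokens(16000, "anthropic") == 4000   # only oversized settings are clamped

The practical effect versus the old table is that the ceilings are now high enough for a user-configured max_tokens up to 8000 (4000 on Anthropic) to pass through unchanged.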

src/veritascribe/pdf_processor.py

Lines changed: 13 additions & 14 deletions

@@ -159,29 +159,27 @@ def _fix_german_umlauts(self, text: str) -> str:
 
         # Replace '"a' with 'ä' and '"A' with 'Ä'
         text = re.sub(r'"a', 'ä', text)
+        text = re.sub(r'¨a', 'ä', text)
+        text = re.sub(r'“a', 'ä', text)
         text = re.sub(r'"A', 'Ä', text)
+        text = re.sub(r'¨A', 'Ä', text)
+        text = re.sub(r'“A', 'Ä', text)
 
         # Replace '"o' with 'ö' and '"O' with 'Ö'
         text = re.sub(r'"o', 'ö', text)
+        text = re.sub(r'¨o', 'ö', text)
+        text = re.sub(r'“o', 'ö', text)
         text = re.sub(r'"O', 'Ö', text)
+        text = re.sub(r'¨O', 'Ö', text)
+        text = re.sub(r'“O', 'Ö', text)
 
         # Replace '"u' with 'ü' and '"U' with 'Ü'
         text = re.sub(r'"u', 'ü', text)
-        text = re.sub(r'"U', 'Ü', text)
-
-        # Replace '¨a' with 'ä' and '¨A' with 'Ä'
-        text = re.sub(r'¨a', 'ä', text)
-        text = re.sub(r'¨A', 'Ä', text)
-
-        # Replace '¨o' with 'ö' and '¨O' with 'Ö'
-        text = re.sub(r'¨o', 'ö', text)
-        text = re.sub(r'¨O', 'Ö', text)
-
-        # Replace '¨u' with 'ü' and '¨U' with 'Ü'
         text = re.sub(r'¨u', 'ü', text)
+        text = re.sub(r'“u', 'ü', text)
+        text = re.sub(r'"U', 'Ü', text)
         text = re.sub(r'¨U', 'Ü', text)
-
-
+        text = re.sub(r'“U', 'Ü', text)
 
         return text

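For review, the reordered substitutions above can be exercised on a sample string. The helper below is a standalone re-implementation for illustration only; the three artifact markers and the umlaut targets come from the diff, but the function itself is not part of pdf_processor.py.

# Illustrative stand-in for the _fix_german_umlauts logic shown above; not project code.
import re

UMLAUT_MAP = {"a": "ä", "A": "Ä", "o": "ö", "O": "Ö", "u": "ü", "U": "Ü"}

def fix_german_umlauts(text: str) -> str:
    """Collapse '"a', '¨a' and '“a' style extraction artifacts back into 'ä' (and friends)."""
    for plain, umlaut in UMLAUT_MAP.items():
        for marker in ('"', '¨', '“'):  # straight quote, spacing diaeresis, curly quote
            text = re.sub(re.escape(marker) + plain, umlaut, text)
    return text

print(fix_german_umlauts('Pr¨ufung der “Anderungen an der L"osung'))
# -> 'Prüfung der Änderungen an der Lösung'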
@@ -235,7 +233,7 @@ def _clean_extracted_text(self, text: str) -> str:
 
         # Step 6: Fix common spacing issues
         text = re.sub(r'\s+([.,;:!?])', r'\1', text)  # Remove space before punctuation
-
+
         # Protect URLs and email addresses to avoid inserting spaces within them
         protected = []
         def _mask(match):

(The removed and added lines in this hunk are both blank; the change is whitespace only.)
@@ -388,6 +386,7 @@ def extract_bibliography_section(self, pdf_path: str) -> Optional[str]:
         for page_num in range(len(doc)):
             page = doc[page_num]
             page_text = page.get_text()
+            page_text = self._clean_extracted_text(page_text)
 
             # Look for bibliography section headers
             bib_headers = [
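The functional change in this last hunk is that page text is cleaned before the bibliography header search. A small sketch of why that matters, with made-up sample text and a trimmed-down clean() that keeps just two of the real substitutions:

# Illustrative only: cleaning normalizes entries before they are searched.
import re

def clean(text: str) -> str:
    text = re.sub(r'¨u', 'ü', text)               # umlaut artifact fix, as in the diff
    text = re.sub(r'\s+([.,;:!?])', r'\1', text)  # drop space before punctuation, as in the diff
    return text

raw_page = 'Literaturverzeichnis\nM¨uller , A. (2020). Grundlagen der Physik.'
assert 'Müller, A.' in clean(raw_page)            # matches its expected form only after cleaning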
