Skip to content

Commit bf0e7f8

Browse files
committed
adds pdf-loader example
1 parent d610ebe commit bf0e7f8

26 files changed

+1278
-0
lines changed
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
.env
2+
*.json
3+
*.db
4+
*.pdf
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
# NOTE: could not get sqlite-vec to work on arm64; on arm64 hosts build the
# image for amd64 instead, e.g. `docker build --platform=linux/amd64 ...`
FROM python:3.12-slim

WORKDIR /app

# install pre-requisites (supervisor and rabbitmq) in a single layer so the
# apt package index is fetched once and removed in the same layer
RUN apt-get update && \
    apt-get install -y \
        supervisor \
        rabbitmq-server \
    && rm -rf /var/lib/apt/lists/*

# dedicated virtual environment for the app and its dependencies
RUN python -m venv /venvs/app

# CPU-only torch wheels; --no-cache-dir keeps pip's download cache out of the image
RUN /venvs/app/bin/pip install --no-cache-dir torch==2.5.1 torchvision==0.20.1 --index-url https://download.pytorch.org/whl/cpu

# requirements are copied and installed before the source code so this layer
# is reused when only the application code changes
COPY requirements.txt /app/requirements.txt
RUN /venvs/app/bin/pip install --no-cache-dir -r /app/requirements.txt

COPY pdf-loader/ /app/pdf-loader
RUN /venvs/app/bin/pip install --no-cache-dir --editable /app/pdf-loader

COPY settings.py /app/settings.py
COPY entrypoint.py /app/entrypoint.py
COPY supervisord.conf /etc/supervisord.conf

# create tables (runs at build time)
RUN /venvs/app/bin/python -m pdf_loader.db

# download models (runs at build time)
RUN /venvs/app/bin/python -m pdf_loader.background

EXPOSE 80

# supervisord starts the processes declared in /etc/supervisord.conf
CMD ["/usr/bin/supervisord", "-c", "/etc/supervisord.conf"]
43+
44+
45+
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
# PDF loader
2+
3+
An app to parse and search over PDFs.
4+
5+
```sh
6+
# create the environment and install the required dependencies
7+
conda create --name pdf-loader python=3.12 rabbitmq-server -c conda-forge -y
8+
9+
# activate environment
10+
conda activate pdf-loader
11+
12+
# install torch cpu
13+
pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu
14+
15+
16+
# install the package and requirements
17+
pip install --editable pdf-loader
18+
pip install -r requirements.txt
19+
20+
# create tables
21+
python -m pdf_loader.db
22+
23+
# start app
24+
python -m pdf_loader.app
25+
# open: http://localhost:5000
26+
27+
# to start celery: launch the rabbitmq server first, then the celery worker
28+
rabbitmq-server
29+
30+
celery --app pdf_loader.background worker --loglevel=INFO --pool=prefork --concurrency=1
31+
```
32+
33+
With Docker:
34+
35+
```sh
36+
docker build -t pdf-loader .
37+
38+
# run the app, then open http://localhost:5000 (host port 5000 is mapped to container port 80)
39+
docker run -p 5000:80 --env-file .env pdf-loader
40+
41+
docker run -it pdf-loader bash
42+
```
43+
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
from pdf_loader.app import app
Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
# documentation builds
2+
doc/_build
3+
4+
# vscode stuff
5+
.vscode
6+
.virtual_documents
7+
8+
# macOS files
9+
.DS_Store
10+
11+
# Byte-compiled / optimized / DLL files
12+
__pycache__/
13+
*.py[cod]
14+
*$py.class
15+
16+
# C extensions
17+
*.so
18+
19+
# Distribution / packaging
20+
.Python
21+
build/
22+
develop-eggs/
23+
dist/
24+
downloads/
25+
eggs/
26+
.eggs/
27+
lib/
28+
lib64/
29+
parts/
30+
sdist/
31+
var/
32+
wheels/
33+
pip-wheel-metadata/
34+
share/python-wheels/
35+
*.egg-info/
36+
.installed.cfg
37+
*.egg
38+
MANIFEST
39+
40+
# PyInstaller
41+
# Usually these files are written by a python script from a template
42+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
43+
*.manifest
44+
*.spec
45+
46+
# Installer logs
47+
pip-log.txt
48+
pip-delete-this-directory.txt
49+
50+
# Unit test / coverage reports
51+
htmlcov/
52+
.tox/
53+
.nox/
54+
.coverage
55+
.coverage.*
56+
.cache
57+
nosetests.xml
58+
coverage.xml
59+
*.cover
60+
*.py,cover
61+
.hypothesis/
62+
.pytest_cache/
63+
64+
# Translations
65+
*.mo
66+
*.pot
67+
68+
# Django stuff:
69+
*.log
70+
local_settings.py
71+
db.sqlite3
72+
db.sqlite3-journal
73+
74+
# Flask stuff:
75+
instance/
76+
.webassets-cache
77+
78+
# Scrapy stuff:
79+
.scrapy
80+
81+
# Sphinx documentation
82+
docs/_build/
83+
84+
# PyBuilder
85+
target/
86+
87+
# Jupyter Notebook
88+
.ipynb_checkpoints
89+
90+
# IPython
91+
profile_default/
92+
ipython_config.py
93+
94+
# pyenv
95+
.python-version
96+
97+
# pipenv
98+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
99+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
100+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
101+
# install all needed dependencies.
102+
#Pipfile.lock
103+
104+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
105+
__pypackages__/
106+
107+
# Celery stuff
108+
celerybeat-schedule
109+
celerybeat.pid
110+
111+
# SageMath parsed files
112+
*.sage.py
113+
114+
# Environments
115+
.env
116+
.venv
117+
env/
118+
venv/
119+
ENV/
120+
env.bak/
121+
venv.bak/
122+
123+
# Spyder project settings
124+
.spyderproject
125+
.spyproject
126+
127+
# Rope project settings
128+
.ropeproject
129+
130+
# mkdocs documentation
131+
/site
132+
133+
# mypy
134+
.mypy_cache/
135+
.dmypy.json
136+
dmypy.json
137+
138+
# Pyre type checker
139+
.pyre/
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# use this file if you wish to include non-python files
2+
# reference: https://packaging.python.org/en/latest/guides/using-manifest-in/
3+
include CHANGELOG.md
4+
include README.md
5+
graft src/pdf_loader/assets
6+
global-exclude __pycache__
7+
global-exclude *.py[co]
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
[tool.pytest.ini_options]
2+
addopts = "--pdbcls=IPython.terminal.debugger:Pdb"
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
[metadata]
2+
description-file = README.md
3+
4+
[flake8]
5+
max-line-length = 88
6+
extend-ignore = E203
7+
extend-exclude = build,node_modules
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
import re
2+
import ast
3+
from glob import glob
4+
from os.path import basename, splitext
5+
6+
from setuptools import find_packages
7+
from setuptools import setup
8+
9+
# Matches a ``__version__ = <literal>`` assignment that starts a line, so that
# mentions of ``__version__`` inside comments or longer identifiers are ignored.
_version_re = re.compile(r"^[ \t]*__version__\s*=\s*(.*)$", re.MULTILINE)


def _read_version(path):
    """Return the version string assigned to ``__version__`` in *path*.

    The file is read as text instead of being imported, so the package (and
    whatever its ``__init__`` executes) does not need to be importable at
    build time.

    Raises:
        RuntimeError: if no ``__version__`` assignment is found in *path*.
    """
    with open(path, encoding="utf-8") as f:
        match = _version_re.search(f.read())

    if match is None:
        raise RuntimeError(f"Unable to find a __version__ assignment in {path}")

    # literal_eval safely turns the right-hand side (a quoted literal) into a value
    return str(ast.literal_eval(match.group(1).strip()))


VERSION = _read_version("src/pdf_loader/__init__.py")

# runtime dependencies (none declared here)
REQUIRES = []

# extra tooling installed with: pip install "pdf-loader[dev]"
DEV = [
    "pytest",
    "flake8",
    "invoke",
    "twine",
]

setup(
    name="pdf-loader",
    version=VERSION,
    description=None,
    license=None,
    author=None,
    author_email=None,
    url=None,
    # src layout: packages live under src/
    packages=find_packages("src"),
    package_dir={"": "src"},
    # also expose any top-level modules placed directly in src/
    py_modules=[splitext(basename(path))[0] for path in glob("src/*.py")],
    include_package_data=True,
    classifiers=[],
    keywords=[],
    install_requires=REQUIRES,
    extras_require={
        "dev": DEV,
    },
    entry_points={
        # 'console_scripts': ['pdf-loader=pdf_loader.cli:cli'],
    },
)
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
from pdf_loader._settings import Settings
2+
3+
# Settings instance created once, when this module is imported.
SETTINGS = Settings()
4+
5+
# NOTE(review): presumably the value setup.py extracts with its __version__
# regex + ast.literal_eval — keep this a plain literal assignment.
__version__ = "0.1dev"

0 commit comments

Comments
 (0)