Commit 8669bc0
Merge pull request #2 from gambolputty/next

Next version

2 parents: 8f66c1f + 351ae96

13 files changed: +86 −40 lines

.env.example
Lines changed: 4 additions & 1 deletion

```diff
@@ -2,4 +2,7 @@ PYTHON_ENV=development
 MONGO_USER=user
 MONGO_PASSWORD=password
 MONGO_DB_NAME=somedb
-MONGO_CREATE_TEXT_INDEX=False
+MONGO_CREATE_TEXT_INDEX=False
+MONGO_OUTSIDE_PORT=1234
+MAX_WORKERS=4
+KEEP_DAYS=2
```

CHANGELOG.md
Lines changed: 18 additions & 0 deletions

```diff
@@ -0,0 +1,18 @@
+# Changelog
+All notable changes to this project will be documented in this file.
+
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+## [1.1.0] - 2021-02-11
+### Added
+- Three new config variables (`MONGO_OUTSIDE_PORT`, `MAX_WORKERS`, `KEEP_DAYS`). See [readme](README.md) for details.
+- Network name for this project (`network name`)
+- Changelog file
+### Changed
+- Config variables are now uppercase
+- Readme
+
+## [1.0.0] - 2021-02-07
+### Added
+- Initial project release
```

HOWTOs.md
Lines changed: 9 additions & 5 deletions

````diff
@@ -8,22 +8,26 @@ Description=Newscorpus
 After=docker.service
 
 [Service]
-User=newscorpus
+Restart=on-failure
+RestartSec=5s
+User=username
 
 Type=oneshot
 RemainAfterExit=yes
+StandardOutput=file:/var/log/newscorpus.log
+StandardError=file:/var/log/newscorpus_error.log
 
 WorkingDirectory=/path/to/newscorpus
-ExecStart=/usr/local/bin/docker-compose up -d
-ExecStop=/usr/local/bin/docker-compose down
+ExecStart=/path/to/docker-compose/docker-compose up -d
+ExecStop=/path/to/docker-compose/docker-compose down
 
 [Install]
 WantedBy=multi-user.target
 ```
 
 3. Commands:
-   1. `systemctl enable newscorpus`
-   2. `systemctl start newscorpus`
+   Enable the service:
+   - `systemctl enable newscorpus`
 
    Then:
    - `service newscorpus status`
````

README.md
Lines changed: 4 additions & 1 deletion

```diff
@@ -21,7 +21,10 @@ Environment variables in `.env`:
 | MONGO_USER | MongoDB user name |
 | MONGO_PASSWORD | MongoDB password |
 | MONGO_DB_NAME | MongoDB database name |
-| MONGO_CREATE_TEXT_INDEX | `true` or `1` to let MongoDB create a text index (helpful for [text search](https://docs.mongodb.com/manual/text-search/)) | |
+| MONGO_CREATE_TEXT_INDEX | `true` or `1` to let MongoDB create a text index (helpful for [text search](https://docs.mongodb.com/manual/text-search/)) |
+| MONGO_OUTSIDE_PORT | Exposed MongoDB port, accessible on your host machine. |
+| MAX_WORKERS | Number of worker threads for the crawler. Remove for auto assignment. |
+| KEEP_DAYS | Discard articles older than **n** days. Default is "2". |
 
 At the moment, there are no other options. If you want to change the sources being crawled, take a look at [sources.json](crawler/app/assets/sources.json).
```
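
`MONGO_OUTSIDE_PORT` only matters from outside the compose network; inside it, services reach MongoDB through the `mongo` hostname on 27017. A hedged sketch of connecting from the host machine (the defaults below mirror `.env.example` and are illustrative, not prescribed by this commit):

```python
import os
from pymongo import MongoClient

# From the host, connect to localhost on the exposed port instead of
# the in-network 'mongo' hostname. Defaults mirror .env.example.
client = MongoClient(
    'localhost',
    int(os.getenv('MONGO_OUTSIDE_PORT', '1234')),
    username=os.getenv('MONGO_USER', 'user'),
    password=os.getenv('MONGO_PASSWORD', 'password'),
)
db = client[os.getenv('MONGO_DB_NAME', 'somedb')]
print(db.articles.estimated_document_count())
```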

crawler/app/__main__.py
Lines changed: 0 additions & 3 deletions

```diff
@@ -17,6 +17,3 @@
 if cmd == 'crawl':
     from app import crawler
     crawler.init()
-# elif cmd == 'dups':
-#     from .database.remove_duplicates import remove_duplicates
-#     remove_duplicates()
```

crawler/app/config.py
Lines changed: 16 additions & 12 deletions

```diff
@@ -1,19 +1,23 @@
 import os
 from pathlib import Path
 
-env = os.getenv('PYTHON_ENV', 'development')
+ENV = os.getenv('PYTHON_ENV', 'development')
 
-root = Path.cwd()
-root = root.joinpath('app')
+# paths
+ROOT_PATH = Path.cwd()
+ROOT_PATH = ROOT_PATH.joinpath('app')
 
-sources_path = root.joinpath('assets', 'sources.json')
-assets_dir_path = root.joinpath('assets')
-logs_dir_path = root.joinpath('logs')
+SOURCES_PATH = ROOT_PATH.joinpath('assets', 'sources.json')
+LOGS_PATH = ROOT_PATH.joinpath('logs')
+
+# general
+MAX_WORKERS = int(os.getenv('MAX_WORKERS', '0')) or None
+KEEP_DAYS = int(os.getenv('KEEP_DAYS', '2'))
 
 # MongoDB
-mongo_host = 'mongo'
-mongo_port = 27017
-mongo_username = os.getenv('MONGO_USER', 'admin')
-mongo_password = os.getenv('MONGO_PASSWORD', '')
-mongo_dbname = os.getenv("MONGO_DB_NAME", '')
-create_mongo_text_index = os.getenv("MONGO_CREATE_TEXT_INDEX", 'False').lower() in ['true', '1']
+MONGO_HOST = 'mongo'
+MONGO_PORT = 27017
+MONGO_USERNAME = os.getenv('MONGO_USER', 'admin')
+MONGO_PASSWORD = os.getenv('MONGO_PASSWORD', '')
+MONGO_DB_NAME = os.getenv("MONGO_DB_NAME", '')
+MONGO_CREATE_TEXT_INDEX = os.getenv("MONGO_CREATE_TEXT_INDEX", 'False').lower() in ['true', '1']
```
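
Two of the new lines use idioms worth a note: `int(os.getenv('MAX_WORKERS', '0')) or None` maps an unset or zero value to `None` (the "auto assignment" the README mentions), and `MONGO_CREATE_TEXT_INDEX` is parsed from its string form. A minimal standalone sketch of both (the sample values are illustrative, not from the project):

```python
import os

# Illustrative value; in the project these come from the .env file.
os.environ['MONGO_CREATE_TEXT_INDEX'] = 'True'

# int('0') is falsy, so an unset or zero MAX_WORKERS collapses to None.
max_workers = int(os.getenv('MAX_WORKERS', '0')) or None
print(max_workers)  # None -> "auto assignment"

# Env vars arrive as strings; accept 'true'/'True'/'1' as truthy.
create_text_index = os.getenv('MONGO_CREATE_TEXT_INDEX', 'False').lower() in ['true', '1']
print(create_text_index)  # True
```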

crawler/app/crawler/__init__.py
Lines changed: 9 additions & 7 deletions

```diff
@@ -16,23 +16,25 @@
 from app.crawler.random_user_agent import random_user_agent
 
 
-create_rotating_log(config.logs_dir_path.joinpath("crawlerlog"))
+create_rotating_log(config.LOGS_PATH.joinpath("crawlerlog"))
 logger = logging.getLogger("rotating_log")
 
 MIN_TEXT_LENGTH = 350
-DISCARD_OLDER_THAN_DAYS = 2
-MAX_WORKERS = 4
 
 
 def init():
     ts = time()
     logger.info('Downloading new articles')
+    logger.info(f'Ignoring articles older than {config.KEEP_DAYS} days')
+
+    if config.MAX_WORKERS:
+        logger.info(f'Maximum crawler workers: {config.MAX_WORKERS}')
 
     # load sources
-    with open(config.sources_path, encoding='utf-8') as f:
+    with open(config.SOURCES_PATH, encoding='utf-8') as f:
         sources = json.load(f)
 
-    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
+    with ThreadPoolExecutor(max_workers=config.MAX_WORKERS) as executor:
         executor.map(scrape_source, sources)
 
     logger.info(f"Downloading done in {time() - ts}")
@@ -107,9 +109,9 @@ def process_feed_item(feed_item, source, articles_in_memory, db):
 
     # date must be within last n days
     difference = datetime.now() - published_at
-    if difference.days > DISCARD_OLDER_THAN_DAYS:
+    if difference.days > config.KEEP_DAYS:
         logger.debug(
-            f'Skip: Article older than {DISCARD_OLDER_THAN_DAYS} days ({published_at})'
+            f'Skip: Article older than {config.KEEP_DAYS} days ({published_at})'
         )
         return False
```
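
Passing `config.MAX_WORKERS` straight into `ThreadPoolExecutor` works because `max_workers=None` tells the executor to pick its own pool size (`min(32, os.cpu_count() + 4)` on Python 3.8+). A standalone sketch with a stand-in for `scrape_source` (the real function, not shown in this diff, downloads a feed):

```python
from concurrent.futures import ThreadPoolExecutor

def scrape_source(source):
    # Stand-in for the crawler's real scrape_source.
    return source['name'].upper()

sources = [{'name': 'a'}, {'name': 'b'}, {'name': 'c'}]

# max_workers=None (the MAX_WORKERS "auto assignment" case) lets the
# executor choose its own worker count; an int caps the pool size.
with ThreadPoolExecutor(max_workers=None) as executor:
    results = list(executor.map(scrape_source, sources))

print(results)  # ['A', 'B', 'C']
```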

crawler/app/database/create_client.py
Lines changed: 4 additions & 4 deletions

```diff
@@ -4,16 +4,16 @@
 
 
 def create_client():
-    client = MongoClient(config.mongo_host, config.mongo_port,
-                         username=config.mongo_username, password=config.mongo_password)
+    client = MongoClient(config.MONGO_HOST, config.MONGO_PORT,
+                         username=config.MONGO_USERNAME, password=config.MONGO_PASSWORD)
 
-    db = client[config.mongo_dbname]
+    db = client[config.MONGO_DB_NAME]
 
     # create indexes
     db.articles.create_index('url', unique=True)
     db.articles.create_index([('created_at', DESCENDING)])
 
-    if config.create_mongo_text_index is True:
+    if config.MONGO_CREATE_TEXT_INDEX is True:
         db.articles.create_index([('text', TEXT)], default_language='german')
 
     return [client, db]
```
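
The unique index on `url` makes MongoDB reject a second insert of the same URL, which is presumably how re-crawled articles are kept out of the collection; `create_index` is also idempotent, so `create_client()` is safe to run on every start-up. A standalone sketch against a local MongoDB (host, port, and credentials are illustrative, not the project's defaults):

```python
from pymongo import MongoClient
from pymongo.errors import DuplicateKeyError

# Illustrative connection values; inside the compose network the host is 'mongo'.
client = MongoClient('localhost', 27017, username='user', password='password')
db = client['somedb']

# Idempotent: re-creating an identical index is a no-op.
db.articles.create_index('url', unique=True)

db.articles.insert_one({'url': 'https://example.com/a', 'text': '...'})
try:
    db.articles.insert_one({'url': 'https://example.com/a', 'text': '...'})
except DuplicateKeyError:
    print('duplicate URL skipped')

client.close()
```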

crawler/app/logger.py
Lines changed: 3 additions & 3 deletions

```diff
@@ -8,16 +8,16 @@
 # logger examples: https://www.programcreek.com/python/example/1475/logging.handlers.RotatingFileHandler
 
 # create log folder if it does not exist
-if os.path.isdir(config.logs_dir_path) is False:
-    os.mkdir(config.logs_dir_path)
+if os.path.isdir(config.LOGS_PATH) is False:
+    os.mkdir(config.LOGS_PATH)
 
 
 def create_rotating_log(path):
     """
     Creates a rotating log
     """
     logger = logging.getLogger('rotating_log')
-    logger.setLevel(logging.DEBUG if config.env == 'development' else logging.INFO)
+    logger.setLevel(logging.DEBUG if config.ENV == 'development' else logging.INFO)
     rotation_handler = RotatingFileHandler(path, maxBytes=1000000, backupCount=4, encoding='utf-8')
     rotation_handler.setFormatter(logging.Formatter('%(levelname)s:%(asctime)s: %(message)s'))
     logger.addHandler(rotation_handler)
```
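
`create_rotating_log` wires a handler onto the logger named `'rotating_log'` once; other modules (like `crawler/__init__.py` above) then fetch the same logger by name, since named loggers are process-wide singletons. A minimal standalone sketch with the same handler settings (the log path is illustrative):

```python
import logging
from logging.handlers import RotatingFileHandler

# Set-up, mirroring create_rotating_log.
logger = logging.getLogger('rotating_log')
logger.setLevel(logging.DEBUG)
handler = RotatingFileHandler('crawlerlog', maxBytes=1000000, backupCount=4, encoding='utf-8')
handler.setFormatter(logging.Formatter('%(levelname)s:%(asctime)s: %(message)s'))
logger.addHandler(handler)

# Elsewhere in the process: the same name returns the same logger object,
# so no handler wiring is repeated at call sites.
same_logger = logging.getLogger('rotating_log')
assert same_logger is logger
same_logger.info('written through the rotating file handler above')
```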

db_backup.sh

File mode changed: 100755 → 100644.
