Skip to content

Commit 8e4e06e

Browse files
committed
Merge branch 'release-0.2'
2 parents 2562552 + 9f96270 commit 8e4e06e

24 files changed

Lines changed: 1907 additions & 72 deletions

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,3 +57,4 @@ target/
5757
# Local configurations
5858
conf/baleen.yaml
5959
fixtures/corpus
60+
fixtures/ddl-feedly.opml

baleen/config.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,6 @@ class BaleenConfiguration(confire.Configuration):
3838
Meaningful defaults and required configurations.
3939
4040
debug: the app will print or log debug statements
41-
testing: the app will not overwrite important resources
4241
database: connection information for mongo
4342
"""
4443

@@ -48,10 +47,11 @@ class BaleenConfiguration(confire.Configuration):
4847
os.path.abspath("conf/baleen.yaml"), # Local configuration
4948
]
5049

51-
debug = True
52-
testing = True
53-
database = MongoConfiguration()
54-
50+
debug = True
51+
database = MongoConfiguration()
52+
logfile = 'baleen.log' # Location to write log
53+
loglevel = 'DEBUG' # Log messages to record
54+
fetch_html = True # Actually fetch HTML link
5555

5656
## Load settings immediately for import
5757
settings = BaleenConfiguration.load()

baleen/feed.py

Lines changed: 37 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -22,14 +22,16 @@
2222
## Imports
2323
##########################################################################
2424

25+
import requests
2526
import feedparser
2627
import baleen.models as db
2728

2829
from copy import copy
2930
from baleen.opml import OPML
3031
from collections import Counter
31-
from baleen.utils import localnow
32-
from baleen.logger import IngestLogger
32+
from baleen.config import settings
33+
from baleen.utils.timez import localnow
34+
from baleen.utils.logger import IngestLogger
3335
from dateutil import parser as dtparser
3436

3537
##########################################################################
@@ -51,8 +53,10 @@ def get_feed_urls(self):
5153
for url in self.feed_urls:
5254
yield url
5355
else:
54-
raise NotImplementedError(("Subclasses must either provide a list of",
55-
" feed_urls or override get_feed_urls."))
56+
raise NotImplementedError((
57+
"Subclasses must either provide a list of "
58+
"feed_urls or override get_feed_urls."
59+
))
5660

5761
def feeds(self):
5862
"""
@@ -131,8 +135,35 @@ def wrangle(self, entry):
131135
post['mimetype'] = selected.get('type')
132136
post['content'] = selected.get('value')
133137

138+
## Fetch the content if requested.
139+
if settings.fetch_html:
140+
page = self.fetch(post.get('url'))
141+
if page:
142+
post['content'] = page
143+
134144
return post
135145

146+
def fetch(self, url):
147+
"""
148+
Fetches the given url and returns the content, capturing errors.
149+
"""
150+
try:
151+
response = requests.get(url)
152+
if response.status_code == 200:
153+
return response.text
154+
else:
155+
self.logger.error(
156+
"Could not fetch '{}': {} {}".format(
157+
url, response.status_code, response.reason
158+
)
159+
)
160+
except Exception as e:
161+
self.logger.error(
162+
"Could not fetch '{}': {}".format(
163+
url, str(e)
164+
)
165+
)
166+
136167
def fields(self):
137168
"""
138169
Returns a count of all the available fields in every entry.
@@ -185,7 +216,7 @@ def get_feed_urls(self):
185216
186217
NOTE: You must connect to the MongoDB before calling this method!
187218
"""
188-
for feed in db.Feed.objects.only('link'):
219+
for feed in db.Feed.objects(active=True).only('link'):
189220
yield feed.link
190221

191222
def update_feed(self, feed, result):
@@ -308,7 +339,7 @@ def __init__(self, path):
308339

309340
def get_feed_urls(self):
310341
for feed in self.opml:
311-
yield feed['link']
342+
yield feed['xmlUrl']
312343

313344
def __len__(self):
314345
return len(self.opml)

baleen/models.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ class Feed(me.DynamicDocument):
7171
link = me.URLField(required=True, unique=True)
7272
urls = me.DictField()
7373
category = me.StringField(required=True)
74+
active = me.BooleanField(default=True)
7475
fetched = me.DateTimeField(default=None)
7576
created = me.DateTimeField(default=datetime.now, required=True)
7677
updated = me.DateTimeField(default=datetime.now, required=True)

baleen/opml.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -38,9 +38,9 @@ def ingest(path, **kwargs):
3838
for feed in opml:
3939
feed.pop('type') # Unneeded for database
4040
feed.pop('text') # Unneeded for database
41-
feed['link'] = feed.pop('xmlurl') # Rename the XML URL
41+
feed['link'] = feed.pop('xmlUrl') # Rename the XML URL
4242
feed['urls'] = {
43-
'htmlurl': feed.pop('htmlurl') # Add htmlurl to urls
43+
'htmlurl': feed.pop('htmlUrl') # Add htmlurl to urls
4444
}
4545
feed = db.Feed(**feed) # Construct without an ObjectId
4646

@@ -68,7 +68,7 @@ def categories(self):
6868
Reads the file to capture all the categories
6969
"""
7070
with open(self.path, 'r') as data:
71-
soup = BeautifulSoup(data)
71+
soup = BeautifulSoup(data, 'xml')
7272
for topic in soup.select('body > outline'):
7373
yield topic['title']
7474

@@ -87,7 +87,7 @@ def __iter__(self):
8787
from the OPML file; also captures category data.
8888
"""
8989
with open(self.path, 'r') as data:
90-
soup = BeautifulSoup(data)
90+
soup = BeautifulSoup(data, 'xml')
9191
for topic in soup.select('body > outline'):
9292
for feed in topic.find_all('outline'):
9393
data = feed.attrs.copy()

baleen/utils/__init__.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# baleen.utils
2+
# Utilities and helper functions for the Baleen project.
3+
#
4+
# Author: Benjamin Bengfort <bengfort@cs.umd.edu>
5+
# Created: Sun Feb 21 15:00:06 2016 -0500
6+
#
7+
# Copyright (C) 2016 University of Maryland
8+
# For license information, see LICENSE.txt
9+
#
10+
# ID: __init__.py [] benjamin@bengfort.com $
11+
12+
"""
13+
Utilities and helper functions for the Baleen project.
14+
"""
15+
16+
##########################################################################
17+
## Imports
18+
##########################################################################
19+
20+
# Imported so the logging configuration can resolve the mongolog handler class
21+
from . import mongolog
Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# baleen.logger
1+
# baleen.utils.logger
22
# Logging utility for Baleen
33
#
44
# Author: Benjamin Bengfort <benjamin@bengfort.com>
@@ -19,7 +19,7 @@
1919

2020
import logging
2121
import logging.config
22-
from baleen.utils import *
22+
from baleen.utils.timez import *
2323
from baleen.config import settings
2424

2525
##########################################################################
@@ -41,32 +41,32 @@
4141
'class': 'logging.NullHandler',
4242
},
4343
'console': {
44-
'level': 'DEBUG',
44+
'level': 'WARNING',
4545
'class': 'logging.StreamHandler',
4646
'formatter': 'simple',
4747
},
4848
'logfile': {
4949
'level': 'INFO',
5050
'class': 'logging.handlers.RotatingFileHandler',
51-
'filename': 'baleen.log',
52-
'maxBytes': '16777216', # 16 MB
51+
'filename': settings.logfile,
52+
'maxBytes': '536870912', # 512 MB
5353
'formatter': 'simple',
5454
},
5555
'mongolog': {
5656
'level': 'INFO',
57-
'class': 'baleen.mongolog.MongoHandler',
57+
'class': 'baleen.utils.mongolog.MongoHandler',
5858
}
5959
},
6060
'loggers': {
6161
'baleen': {
62-
'level': 'DEBUG',
63-
'handlers': ['console'],
62+
'level': settings.loglevel,
63+
'handlers': ['logfile'],
6464
'propagagte': True,
6565
},
6666
'baleen.ingest': {
6767
'level': 'INFO',
6868
'handlers': ['logfile', 'mongolog'],
69-
'propagate': True,
69+
'propagate': False,
7070
}
7171
},
7272
}
@@ -121,10 +121,10 @@ def error(self, message, *args, **kwargs):
121121
def critical(self, message, *args, **kwargs):
122122
return self.log(logging.CRITICAL, message, *args, **kwargs)
123123

124+
124125
class IngestLogger(Logger):
125126
"""
126-
Performs logging for the coruscate process with the log options above.
127+
Performs logging for the baleen process with the log options above.
127128
"""
128129

129130
logger = logging.getLogger('baleen.ingest')
130-
Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# baleen.mongolog
1+
# baleen.utils.mongolog
22
# Handlers and formatters for logging to Mongo
33
#
44
# Author: Benjamin Bengfort <benjamin@bengfort.com>
@@ -20,12 +20,12 @@
2020
import getpass
2121
import logging
2222
import logging.config
23-
from baleen.utils import *
23+
from baleen.utils.timez import *
2424
from baleen.config import settings
2525

2626
from datetime import datetime
2727
from socket import gethostname
28-
from pymongo import Connection
28+
from pymongo import MongoClient
2929
from pymongo.errors import OperationFailure, PyMongoError
3030

3131
##########################################################################
@@ -44,7 +44,6 @@ def format(self, record):
4444

4545
## Get the dictionary ready for Mongo
4646
data = record.__dict__.copy()
47-
print data
4847

4948
## Get the log message as intended via super
5049
message = super(MongoFormatter, self).format(record)
@@ -112,7 +111,7 @@ def connect(self):
112111
Connect to the Mongo database.
113112
"""
114113
try:
115-
self.connection = Connection(host=self.host, port=self.port)
114+
self.connection = MongoClient(host=self.host, port=self.port)
116115
except PyMongoError:
117116
if self.fail_silently:
118117
return
@@ -127,7 +126,6 @@ def close(self):
127126
Close the connection to the Mongo database.
128127
"""
129128
if self.connection is not None:
130-
self.connection.disconnect()
131129
self.connection = None
132130

133131
def emit(self, record):

baleen/utils.py renamed to baleen/utils/timez.py

Lines changed: 1 addition & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# baleen.utils
1+
# baleen.utils.timez
22
# Utility functions for Baleen
33
#
44
# Author: Benjamin Bengfort <benjamin@bengfort.com>
@@ -62,13 +62,3 @@ def strptimez(dtstr, dtfmt):
6262
delta = timedelta(hours = offset/100)
6363
utctsp = datetime.strptime(dtstr, dtfmt) - delta
6464
return utctsp.replace(tzinfo=tzutc())
65-
66-
def dthandler(obj, dtftmt="%Y-%m-%dT%H:%M:%S"):
67-
"""
68-
JSON helper function that provides a handler for Python datetime objects,
69-
returning the ISO 8601 format.
70-
"""
71-
dthandler = None
72-
if isinstance(obj, datetime) or isinstance(obj, date):
73-
dthandler = obj.strftime(dtftmt)
74-
return dthandler

baleen/version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919

2020
__version_info__ = {
2121
'major': 0,
22-
'minor': 1,
22+
'minor': 2,
2323
'micro': 0,
2424
'releaselevel': 'final',
2525
'serial': 0,

0 commit comments

Comments
 (0)