2222## Imports
2323##########################################################################
2424
25+ import requests
2526import feedparser
2627import baleen .models as db
2728
2829from copy import copy
2930from baleen .opml import OPML
3031from collections import Counter
31- from baleen .utils import localnow
32- from baleen .logger import IngestLogger
32+ from baleen .config import settings
33+ from baleen .utils .timez import localnow
34+ from baleen .utils .logger import IngestLogger
3335from dateutil import parser as dtparser
3436
3537##########################################################################
@@ -51,8 +53,10 @@ def get_feed_urls(self):
5153 for url in self .feed_urls :
5254 yield url
5355 else :
54- raise NotImplementedError (("Subclasses must either provide a list of" ,
55- " feed_urls or override get_feed_urls." ))
56+ raise NotImplementedError ((
57+ "Subclasses must either provide a list of "
58+ "feed_urls or override get_feed_urls."
59+ ))
5660
5761 def feeds (self ):
5862 """
@@ -131,8 +135,35 @@ def wrangle(self, entry):
131135 post ['mimetype' ] = selected .get ('type' )
132136 post ['content' ] = selected .get ('value' )
133137
138+ ## Fetch the content if requested.
139+ if settings .fetch_html :
140+ page = self .fetch (post .get ('url' ))
141+ if page :
142+ post ['content' ] = page
143+
134144 return post
135145
def fetch(self, url, timeout=10):
    """
    Fetches the given url and returns the content, capturing errors.

    Issues an HTTP GET for ``url`` and returns the response body as text
    when the server answers with status code 200. Every failure (a missing
    url, a non-200 status, or any exception raised while requesting) is
    reported through ``self.logger.error`` and yields ``None``, so callers
    only need to check the return value for truthiness.

    ``timeout`` is the number of seconds passed to ``requests.get``; without
    it a server that never responds would block the caller indefinitely.
    """
    ## Nothing to request: report it and skip the network round trip.
    if not url:
        self.logger.error(
            "Could not fetch '{}': {}".format(url, "no url provided")
        )
        return None

    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code == 200:
            return response.text

        ## The server answered, but not with the page that was asked for.
        self.logger.error(
            "Could not fetch '{}': {} {}".format(
                url, response.status_code, response.reason
            )
        )
    except Exception as e:
        ## Deliberately broad: fetching is best effort and a single bad
        ## page must never abort the ingestion of the remaining posts.
        self.logger.error(
            "Could not fetch '{}': {}".format(url, str(e))
        )

    return None
166+
136167 def fields (self ):
137168 """
138169 Returns a count of all the available fields in every entry.
@@ -185,7 +216,7 @@ def get_feed_urls(self):
185216
186217 NOTE: You must connect to the MongoDB before calling this method!
187218 """
188- for feed in db .Feed .objects .only ('link' ):
219+ for feed in db .Feed .objects ( active = True ) .only ('link' ):
189220 yield feed .link
190221
191222 def update_feed (self , feed , result ):
@@ -308,7 +339,7 @@ def __init__(self, path):
308339
def get_feed_urls(self):
    """
    Yields the feed url of every entry in the OPML document.

    Each entry produced by iterating ``self.opml`` is indexed by its
    ``xmlUrl`` key, and that value is yielded.
    """
    for feed in self.opml:
        yield feed['xmlUrl']
312343
313344 def __len__ (self ):
314345 return len (self .opml )
0 commit comments