Skip to content
This repository was archived by the owner on Nov 11, 2019. It is now read-only.

Use youtube-dl as a library #54

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 10 additions & 23 deletions steve/scrapers.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,10 @@
# license.
#######################################################################

import json
import subprocess
from datetime import datetime

import youtube_dl

from steve.util import is_youtube


Expand All @@ -27,7 +27,7 @@ class YoutubeScraper(object):
def transform_item(self, item):
"""Converts youtube-dl output to richard fields"""
return {
'title': item['fulltitle'],
'title': item.get('fulltitle') or item['title'],
'summary': item['description'],
'description': '',
'state': 2, # Draft
Expand All @@ -41,7 +41,7 @@ def transform_item(self, item):
'whiteboard': '',
'recorded': datetime.strptime(item['upload_date'], '%Y%m%d'),
'slug': '',
'tags': item['categories'],
'tags': item.get('categories', []) + item.get('tags', []),
'speakers': []
}

Expand All @@ -50,23 +50,10 @@ def scrape(self, url):
if not is_youtube(url):
return

# FIXME: Sometimes youtube-dl takes a *long* time to run. This
# needs to give indication of progress.
try:
output = subprocess.check_output(
['youtube-dl', '-j', url],
stderr=subprocess.STDOUT
)
except subprocess.CalledProcessError as cpe:
raise ScraperError('youtube-dl said "{0}".'.format(cpe.output))
except OSError:
raise ScraperError('youtube-dl not installed or not on PATH.')

# Each line is a single JSON object.
items = []
for line in output.splitlines():
items.append(json.loads(line))

items = [self.transform_item(item) for item in items]
with youtube_dl.YoutubeDL() as ydl:
result = ydl.extract_info(url, download=False)

return items
if 'entries' in result:
return [self.transform_item(item) for item in result['entries']]
else:
return [self.transform_item(result)]