Skip to content

Commit 665b254

Browse files
dirkfGitHub Actions
authored and
GitHub Actions
committed
[Vbox7IE] Sanitise ld+json containing unexpected characters
* based on PR #29680 * added hack to force invoking `transform_source` * fixes #26218
1 parent 7160f2f commit 665b254

File tree

1 file changed

+22
-0
lines changed

1 file changed

+22
-0
lines changed

youtube_dl/extractor/vbox7.py

+22
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import time
66

77
from .common import InfoExtractor
8+
from ..compat import compat_kwargs
89
from ..utils import (
910
determine_ext,
1011
ExtractorError,
@@ -75,6 +76,27 @@ def _extract_url(cls, webpage):
7576
if mobj:
7677
return mobj.group('url')
7778

79+
# transform_source=None, fatal=True
80+
def _parse_json(self, json_string, video_id, *args, **kwargs):
    """Parse JSON, first repairing raw line breaks inside ld+json strings.

    Takes the same arguments as the parent class's `_parse_json`; the
    first optional argument (positional or keyword) is `transform_source`.
    When `json_string` looks like ld+json and the caller supplied no
    `transform_source`, a sanitiser is passed in that slot so that the
    parent applies it before decoding. Every other argument, and the
    parent's return value, are passed through unchanged.
    """
    if '"@context"' in json_string[:30]:
        # this is ld+json, or that's the way to bet
        transform_source = args[0] if args else kwargs.get('transform_source')
        if not transform_source:

            def fix_chars(src):
                # fix malformed ld+json: replace raw CRLFs with escaped LFs
                #
                # The pattern has to consume *complete* JSON string literals:
                # `*` (not `+`) so that an empty string `""` is matched, and
                # `\\.` so that an escaped quote `\"` does not end the match.
                # Otherwise quote pairing drifts and line breaks *between*
                # strings get rewritten, corrupting otherwise valid JSON.
                # `(?s)` lets `.` cover a line break following a backslash;
                # it is written inline as old `re.sub` has no `flags` arg.
                return re.sub(
                    r'(?s)"(?:[^"\\]|\\.)*"',
                    lambda m: re.sub(r'\r?\n', r'\\n', m.group(0)), src)

            if args:
                # caller passed an empty transform_source positionally:
                # replace that slot, keep any following positionals
                args = (fix_chars,) + args[1:]
            else:
                kwargs['transform_source'] = fix_chars
                kwargs = compat_kwargs(kwargs)

    return super(Vbox7IE, self)._parse_json(
        json_string, video_id, *args, **kwargs)
99+
78100
def _real_extract(self, url):
79101
video_id = self._match_id(url)
80102
url = 'https://vbox7.com/play:%s' % (video_id,)

0 commit comments

Comments
 (0)