summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author dirkf <fieldhouse@gmx.net> 2024-01-27 18:17:09 +0000
committer dirkf <fieldhouse@gmx.net> 2024-02-02 12:36:05 +0000
commit 4416f82c809a81737d68875dcb201e366d58dabd (patch)
tree 9492c135e64ab5d2e97505543cfcb491741a66c2
parent bdda6b81df61f52eed2502c8ae624d297d918488 (diff)
[Vbox7IE] Sanitise ld+json containing unexpected characters
* based on PR #29680
* added hack to force invoking `transform_source`
* fixes #26218
-rw-r--r-- youtube_dl/extractor/vbox7.py 22
1 files changed, 22 insertions, 0 deletions
diff --git a/youtube_dl/extractor/vbox7.py b/youtube_dl/extractor/vbox7.py
index c504c5311e..d114ecb074 100644
--- a/youtube_dl/extractor/vbox7.py
+++ b/youtube_dl/extractor/vbox7.py
@@ -5,6 +5,7 @@ import re
import time
from .common import InfoExtractor
+from ..compat import compat_kwargs
from ..utils import (
determine_ext,
ExtractorError,
@@ -75,6 +76,27 @@ class Vbox7IE(InfoExtractor):
if mobj:
return mobj.group('url')
+ # transform_source=None, fatal=True
+    def _parse_json(self, json_string, video_id, *args, **kwargs):
+        """Parse JSON, first repairing raw line breaks in ld+json strings.
+
+        All arguments after `video_id` are passed through to the parent
+        class's `_parse_json`. The first of them, given either positionally
+        or as the keyword `transform_source`, is an optional callable applied
+        to the text before parsing.
+
+        When `"@context"` occurs within the first 30 characters of
+        `json_string` (taken as a sign of ld+json) and the caller supplied no
+        transform, a transform is substituted that escapes raw (CR)LF
+        characters found inside string literals. A caller-supplied transform
+        is always left untouched.
+
+        Returns whatever the parent `_parse_json` returns.
+        """
+        if '"@context"' in json_string[:30]:
+            # this is ld+json, or that's the way to bet
+            transform_source = args[0] if args else kwargs.get('transform_source')
+            if not transform_source:
+
+                def fix_chars(src):
+                    # fix malformed ld+json: replace raw CRLFs with escaped LFs
+                    #
+                    # The pattern matches one complete JSON string literal:
+                    # an opening quote, any run of characters that are neither
+                    # a quote nor a backslash, interleaved with backslash
+                    # escapes (so `\"` does not end the literal), then the
+                    # closing quote. Empty strings match too. Matching whole
+                    # literals keeps quotes correctly paired, so newlines in
+                    # the whitespace *between* tokens are never rewritten.
+                    return re.sub(
+                        r'(?s)"[^"\\]*(?:\\.[^"\\]*)*"',
+                        lambda m: re.sub(r'\r?\n', r'\\n', m.group(0)), src)
+
+                # hand the transform on in the same form (positional or
+                # keyword) in which the caller passed, or omitted, it
+                if args:
+                    args = (fix_chars,) + args[1:]
+                else:
+                    kwargs['transform_source'] = fix_chars
+                    kwargs = compat_kwargs(kwargs)
+
+        return super(Vbox7IE, self)._parse_json(
+            json_string, video_id, *args, **kwargs)
+
def _real_extract(self, url):
video_id = self._match_id(url)
url = 'https://vbox7.com/play:%s' % (video_id,)