From b5242da7d24028f60cd23fd10f28fb635c7c7634 Mon Sep 17 00:00:00 2001 From: lanegramling Date: Thu, 16 Dec 2021 11:42:17 -0700 Subject: [PATCH 01/70] [youtube] Update signature function patterns (closes #30363) (#30366) --- youtube_dl/extractor/youtube.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index dc4bd4a77..62e58c13e 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1323,10 +1323,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): funcname = self._search_regex( (r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P[a-zA-Z0-9$]+)\(', r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P[a-zA-Z0-9$]+)\(', - r'\bm=(?P[a-zA-Z0-9$]{2})\(decodeURIComponent\(h\.s\)\)', - r'\bc&&\(c=(?P[a-zA-Z0-9$]{2})\(decodeURIComponent\(c\)\)', - r'(?:\b|[^a-zA-Z0-9$])(?P[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\);[a-zA-Z0-9$]{2}\.[a-zA-Z0-9$]{2}\(a,\d+\)', - r'(?:\b|[^a-zA-Z0-9$])(?P[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)', + r'\bm=(?P[a-zA-Z0-9$]{2,})\(decodeURIComponent\(h\.s\)\)', + r'\bc&&\(c=(?P[a-zA-Z0-9$]{2,})\(decodeURIComponent\(c\)\)', + r'(?:\b|[^a-zA-Z0-9$])(?P[a-zA-Z0-9$]{2,})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\);[a-zA-Z0-9$]{2}\.[a-zA-Z0-9$]{2}\(a,\d+\)', + r'(?:\b|[^a-zA-Z0-9$])(?P[a-zA-Z0-9$]{2,})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)', r'(?P[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)', # Obsolete patterns r'(["\'])signature\1\s*,\s*(?P[a-zA-Z0-9$]+)\(', From e41882335066ed03b1f4837e72fc0e83dfbe3525 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 17 Dec 2021 01:43:16 +0700 Subject: [PATCH 02/70] [ChangeLog] Actualize [ci skip] --- ChangeLog | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/ChangeLog b/ChangeLog index 680fffdf8..e530e6aea 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,28 @@ +version + +Core +* [postprocessor/ffmpeg] Show ffmpeg output on error (#22680, #29336) + +Extractors +* [youtube] Update signature function patterns (#30363, #30366) +* [peertube] Only call description endpoint if necessary (#29383) +* [periscope] Pass referer to HLS requests (#29419) +- [liveleak] Remove extractor (#17625, #24222, #29331) ++ [pornhub] Add support for pornhubthbh7ap3u.onion +* [pornhub] Detect geo restriction +* [pornhub] Dismiss tbr extracted from download URLs (#28927) +* [curiositystream:collection] Extend _VALID_URL (#26326, #29117) +* [youtube] Make get_video_info processing more robust (#29333) +* [youtube] Workaround for get_video_info request (#29333) +* [bilibili] Strip uploader name (#29202) +* [youtube] Update invidious instance list (#29281) +* [umg:de] Update GraphQL API URL (#29304) +* [nrk] Switch psapi URL to https (#29344) ++ [egghead] Add support for app.egghead.io (#28404, #29303) +* [appleconnect] Fix extraction (#29208) ++ [orf:tvthek] Add support for MPD formats (#28672, #29236) + + version 2021.06.06 Extractors From 5014bd67c22b421207b2650d4dc874b95b36dda1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 17 Dec 2021 01:49:07 +0700 Subject: [PATCH 03/70] release 2021.12.17 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- docs/supportedsites.md | 2 -- youtube_dl/version.py | 2 +- 8 files changed, 14 insertions(+), 16 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 4eb505231..e5405c235 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2021.06.06** +- [ ] I've verified that I'm running youtube-dl version **2021.12.17** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.06.06 + [debug] youtube-dl version 2021.12.17 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 9fed0b489..33b01ce7f 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2021.06.06** +- [ ] I've verified that I'm running youtube-dl version **2021.12.17** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 573e8ded0..285610cc7 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2021.06.06** +- [ ] I've verified that I'm running youtube-dl version **2021.12.17** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index c0031bf7a..af73525fb 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2021.06.06** +- [ ] I've verified that I'm running youtube-dl version **2021.12.17** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.06.06 + [debug] youtube-dl version 2021.12.17 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index 1138ab2ca..42c878b83 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2021.06.06** +- [ ] I've verified that I'm running youtube-dl version **2021.12.17** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index e530e6aea..658864282 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2021.12.17 Core * [postprocessor/ffmpeg] Show ffmpeg output on error (#22680, #29336) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index ed0d5e9d9..ae2a6b8b0 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -472,8 +472,6 @@ - **LinuxAcademy** - **LiTV** - **LiveJournal** - - **LiveLeak** - - **LiveLeakEmbed** - **livestream** - **livestream:original** - **LnkGo** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 461dd87ca..b82fbc702 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2021.06.06' +__version__ = '2021.12.17' From ed99d68bdddfba0440dc81c105d5c0ea7cee7d1c Mon Sep 17 00:00:00 2001 From: pukkandan Date: Sun, 30 Jan 2022 00:41:47 +0530 Subject: [PATCH 04/70] Add back `YoutubeSearchURLIE` --- test/test_all_urls.py | 6 +- youtube_dl/extractor/extractors.py | 2 +- youtube_dl/extractor/youtube.py | 177 +++++++++++++++-------------- 3 files changed, 93 insertions(+), 92 deletions(-) diff --git a/test/test_all_urls.py b/test/test_all_urls.py index df6d81b5d..0e1328ede 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -66,9 +66,9 @@ class TestAllURLsMatching(unittest.TestCase): self.assertMatch('https://www.youtube.com/feed/watch_later', ['youtube:tab']) self.assertMatch('https://www.youtube.com/feed/subscriptions', ['youtube:tab']) - # def test_youtube_search_matching(self): - # self.assertMatch('http://www.youtube.com/results?search_query=making+mustard', ['youtube:search_url']) - # self.assertMatch('https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', ['youtube:search_url']) + def test_youtube_search_matching(self): + self.assertMatch('http://www.youtube.com/results?search_query=making+mustard', ['youtube:search_url']) + self.assertMatch('https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', ['youtube:search_url']) def test_youtube_extract(self): assertExtractId = lambda url, id: self.assertEqual(YoutubeIE.extract_id(url), id) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 9b449937d..d403a2dbe 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1556,7 +1556,7 @@ from .youtube import ( YoutubeRecommendedIE, YoutubeSearchDateIE, YoutubeSearchIE, - #YoutubeSearchURLIE, + YoutubeSearchURLIE, YoutubeSubscriptionsIE, YoutubeTruncatedIDIE, YoutubeTruncatedURLIE, diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 87bdc1677..578cfcf90 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -308,6 +308,77 @@ class YoutubeBaseInfoExtractor(InfoExtractor): r'ytcfg\.set\s*\(\s*({.+?})\s*\)\s*;', webpage, 'ytcfg', default='{}'), video_id, fatal=False) + def _search_results(self, query, params): + data = { + 'context': { + 'client': { + 'clientName': 'WEB', + 'clientVersion': '2.20201021.03.00', + } + }, + 'query': query, + } + if params: + data['params'] = params + for page_num in itertools.count(1): + search = self._download_json( + 'https://www.youtube.com/youtubei/v1/search?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', + video_id='query "%s"' % query, + note='Downloading page %s' % page_num, + errnote='Unable to download API page', fatal=False, + data=json.dumps(data).encode('utf8'), + headers={'content-type': 'application/json'}) + if not search: + break + slr_contents = try_get( + search, + (lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'], + lambda x: x['onResponseReceivedCommands'][0]['appendContinuationItemsAction']['continuationItems']), + list) + if not slr_contents: + break + isr_contents = try_get( + slr_contents, + lambda x: x[0]['itemSectionRenderer']['contents'], + list) + if not isr_contents: + break + for content in isr_contents: + if not isinstance(content, dict): + continue + video = content.get('videoRenderer') + if not isinstance(video, dict): + continue + video_id = video.get('videoId') + if not video_id: + continue + title = try_get(video, lambda x: x['title']['runs'][0]['text'], compat_str) + description = try_get(video, lambda x: x['descriptionSnippet']['runs'][0]['text'], compat_str) + duration = parse_duration(try_get(video, lambda x: x['lengthText']['simpleText'], compat_str)) + view_count_text = try_get(video, lambda x: x['viewCountText']['simpleText'], compat_str) or '' + view_count = int_or_none(self._search_regex( + r'^(\d+)', re.sub(r'\s', '', view_count_text), + 'view count', default=None)) + uploader = try_get(video, lambda x: x['ownerText']['runs'][0]['text'], compat_str) + yield { + '_type': 'url_transparent', + 'ie_key': YoutubeIE.ie_key(), + 'id': video_id, + 'url': video_id, + 'title': title, + 'description': description, + 'duration': duration, + 'view_count': view_count, + 'uploader': uploader, + } + token = try_get( + slr_contents, + lambda x: x[1]['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token'], + compat_str) + if not token: + break + data['continuation'] = token + class YoutubeIE(YoutubeBaseInfoExtractor): IE_DESC = 'YouTube.com' @@ -2454,7 +2525,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): (?: (?:channel|c|user|feed)/| (?:playlist|watch)\?.*?\blist=| - (?!(?:watch|embed|v|e)\b) + (?!(?:watch|embed|v|e|results)\b) ) (?P[^/?\#&]+) ''' @@ -3379,88 +3450,18 @@ class YoutubeFavouritesIE(YoutubeBaseInfoExtractor): class YoutubeSearchIE(SearchInfoExtractor, YoutubeBaseInfoExtractor): IE_DESC = 'YouTube.com searches' - # there doesn't appear to be a real limit, for example if you search for - # 'python' you get more than 8.000.000 results - _MAX_RESULTS = float('inf') IE_NAME = 'youtube:search' _SEARCH_KEY = 'ytsearch' - _SEARCH_PARAMS = None + _SEARCH_PARAMS = 'EgIQAQ%3D%3D' # Videos only _TESTS = [] def _entries(self, query, n): - data = { - 'context': { - 'client': { - 'clientName': 'WEB', - 'clientVersion': '2.20201021.03.00', - } - }, - 'query': query, - } - if self._SEARCH_PARAMS: - data['params'] = self._SEARCH_PARAMS total = 0 - for page_num in itertools.count(1): - search = self._download_json( - 'https://www.youtube.com/youtubei/v1/search?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', - video_id='query "%s"' % query, - note='Downloading page %s' % page_num, - errnote='Unable to download API page', fatal=False, - data=json.dumps(data).encode('utf8'), - headers={'content-type': 'application/json'}) - if not search: - break - slr_contents = try_get( - search, - (lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'], - lambda x: x['onResponseReceivedCommands'][0]['appendContinuationItemsAction']['continuationItems']), - list) - if not slr_contents: - break - isr_contents = try_get( - slr_contents, - lambda x: x[0]['itemSectionRenderer']['contents'], - list) - if not isr_contents: - break - for content in isr_contents: - if not isinstance(content, dict): - continue - video = content.get('videoRenderer') - if not isinstance(video, dict): - continue - video_id = video.get('videoId') - if not video_id: - continue - title = try_get(video, lambda x: x['title']['runs'][0]['text'], compat_str) - description = try_get(video, lambda x: x['descriptionSnippet']['runs'][0]['text'], compat_str) - duration = parse_duration(try_get(video, lambda x: x['lengthText']['simpleText'], compat_str)) - view_count_text = try_get(video, lambda x: x['viewCountText']['simpleText'], compat_str) or '' - view_count = int_or_none(self._search_regex( - r'^(\d+)', re.sub(r'\s', '', view_count_text), - 'view count', default=None)) - uploader = try_get(video, lambda x: x['ownerText']['runs'][0]['text'], compat_str) - total += 1 - yield { - '_type': 'url_transparent', - 'ie_key': YoutubeIE.ie_key(), - 'id': video_id, - 'url': video_id, - 'title': title, - 'description': description, - 'duration': duration, - 'view_count': view_count, - 'uploader': uploader, - } - if total == n: - return - token = try_get( - slr_contents, - lambda x: x[1]['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token'], - compat_str) - if not token: - break - data['continuation'] = token + for entry in self._search_results(query, self._SEARCH_PARAMS): + yield entry + total += 1 + if total >= n: + return def _get_n_results(self, query, n): """Get a specified number of results for a query""" @@ -3471,18 +3472,19 @@ class YoutubeSearchDateIE(YoutubeSearchIE): IE_NAME = YoutubeSearchIE.IE_NAME + ':date' _SEARCH_KEY = 'ytsearchdate' IE_DESC = 'YouTube.com searches, newest videos first' - _SEARCH_PARAMS = 'CAI%3D' + _SEARCH_PARAMS = 'CAISAhAB' # Videos only, sorted by date + _TESTS = [] -r""" -class YoutubeSearchURLIE(YoutubeSearchIE): - IE_DESC = 'YouTube.com search URLs' - IE_NAME = 'youtube:search_url' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P[^&]+)(?:[&]|$)' +class YoutubeSearchURLIE(YoutubeBaseInfoExtractor): + IE_DESC = 'YouTube search URLs with sorting and filter support' + IE_NAME = YoutubeSearchIE.IE_NAME + '_url' + _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?:[^&]+)(?:[&]|$)' _TESTS = [{ 'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', 'playlist_mincount': 5, 'info_dict': { + 'id': 'youtube-dl test video', 'title': 'youtube-dl test video', } }, { @@ -3491,11 +3493,10 @@ class YoutubeSearchURLIE(YoutubeSearchIE): }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - query = compat_urllib_parse_unquote_plus(mobj.group('query')) - webpage = self._download_webpage(url, query) - return self.playlist_result(self._process_page(webpage), playlist_title=query) -""" + qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + query = (qs.get('search_query') or qs.get('q'))[0] + params = qs.get('sp', ('',))[0] + return self.playlist_result(self._search_results(query, params), query, query) class YoutubeFeedsInfoExtractor(YoutubeTabIE): From bfe72723d8318f8bfcb35dee69a40758df5fa3c0 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Sun, 30 Jan 2022 00:49:55 +0530 Subject: [PATCH 05/70] Use `itertools.islice` --- youtube_dl/extractor/youtube.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 578cfcf90..017837e10 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -3455,17 +3455,10 @@ class YoutubeSearchIE(SearchInfoExtractor, YoutubeBaseInfoExtractor): _SEARCH_PARAMS = 'EgIQAQ%3D%3D' # Videos only _TESTS = [] - def _entries(self, query, n): - total = 0 - for entry in self._search_results(query, self._SEARCH_PARAMS): - yield entry - total += 1 - if total >= n: - return - def _get_n_results(self, query, n): """Get a specified number of results for a query""" - return self.playlist_result(self._entries(query, n), query) + entries = itertools.islice(self._search_results(query, self._SEARCH_PARAMS), 0, None if n == float('inf') else n) + return self.playlist_result(entries, query, query) class YoutubeSearchDateIE(YoutubeSearchIE): From 2c4cb134a90b49a4d44965b57ff43cfd45ec2d69 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Sun, 30 Jan 2022 00:54:22 +0530 Subject: [PATCH 06/70] Fix max_results --- youtube_dl/extractor/youtube.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 017837e10..bbd3e80d8 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -3453,6 +3453,7 @@ class YoutubeSearchIE(SearchInfoExtractor, YoutubeBaseInfoExtractor): IE_NAME = 'youtube:search' _SEARCH_KEY = 'ytsearch' _SEARCH_PARAMS = 'EgIQAQ%3D%3D' # Videos only + _MAX_RESULTS = float('inf') _TESTS = [] def _get_n_results(self, query, n): From 57044eacebc6f2f3cd83c345e1b6e659a22e4773 Mon Sep 17 00:00:00 2001 From: df Date: Thu, 28 Oct 2021 15:55:38 +0100 Subject: [PATCH 07/70] Fix test_youtube_playlist_noplaylist --- test/test_youtube_lists.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/test/test_youtube_lists.py b/test/test_youtube_lists.py index cf2fdf14f..72820972e 100644 --- a/test/test_youtube_lists.py +++ b/test/test_youtube_lists.py @@ -1,4 +1,5 @@ #!/usr/bin/env python +# -*- coding: utf-8 -*- from __future__ import unicode_literals # Allow direct execution @@ -9,11 +10,10 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from test.helper import FakeYDL - from youtube_dl.extractor import ( + YoutubeIE, YoutubePlaylistIE, YoutubeTabIE, - YoutubeIE, ) @@ -25,9 +25,11 @@ class TestYoutubeLists(unittest.TestCase): def test_youtube_playlist_noplaylist(self): dl = FakeYDL() dl.params['noplaylist'] = True + dl.params['format'] = 'best' ie = YoutubePlaylistIE(dl) result = ie.extract('https://www.youtube.com/watch?v=FXxLjLQi3Fg&list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re') self.assertEqual(result['_type'], 'url') + result = dl.extract_info(result['url'], download=False, ie_key=result.get('ie_key'), process=False) self.assertEqual(YoutubeIE().extract_id(result['url']), 'FXxLjLQi3Fg') def test_youtube_course(self): From 46e0a729b2d4503d8d49433fdddfce726d08261e Mon Sep 17 00:00:00 2001 From: df Date: Thu, 28 Oct 2021 15:57:10 +0100 Subject: [PATCH 08/70] Remove obsolete test_youtube_course --- test/test_youtube_lists.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/test/test_youtube_lists.py b/test/test_youtube_lists.py index 72820972e..e1636a1a6 100644 --- a/test/test_youtube_lists.py +++ b/test/test_youtube_lists.py @@ -32,16 +32,6 @@ class TestYoutubeLists(unittest.TestCase): result = dl.extract_info(result['url'], download=False, ie_key=result.get('ie_key'), process=False) self.assertEqual(YoutubeIE().extract_id(result['url']), 'FXxLjLQi3Fg') - def test_youtube_course(self): - dl = FakeYDL() - ie = YoutubePlaylistIE(dl) - # TODO find a > 100 (paginating?) videos course - result = ie.extract('https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8') - entries = list(result['entries']) - self.assertEqual(YoutubeIE().extract_id(entries[0]['url']), 'j9WZyLZCBzs') - self.assertEqual(len(entries), 25) - self.assertEqual(YoutubeIE().extract_id(entries[-1]['url']), 'rYefUsYuEp0') - def test_youtube_mix(self): dl = FakeYDL() ie = YoutubePlaylistIE(dl) From 2c2c2bd348b7dce0aad55a6fc37a18c6f9a000e3 Mon Sep 17 00:00:00 2001 From: df Date: Fri, 29 Oct 2021 03:03:00 +0100 Subject: [PATCH 09/70] Fix test_youtube_mix --- test/test_youtube_lists.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/test/test_youtube_lists.py b/test/test_youtube_lists.py index e1636a1a6..fae8a950a 100644 --- a/test/test_youtube_lists.py +++ b/test/test_youtube_lists.py @@ -34,12 +34,14 @@ class TestYoutubeLists(unittest.TestCase): def test_youtube_mix(self): dl = FakeYDL() - ie = YoutubePlaylistIE(dl) - result = ie.extract('https://www.youtube.com/watch?v=W01L70IGBgE&index=2&list=RDOQpdSVF_k_w') - entries = result['entries'] + dl.params['format'] = 'best' + ie = YoutubeTabIE(dl) + result = dl.extract_info('https://www.youtube.com/watch?v=uVJ0Il5WvbE&list=PLhQjrBD2T381k8ul4WQ8SQ165XqY149WW', + download=False, ie_key=ie.ie_key(), process=True) + entries = (result or {}).get('entries', [{'id': 'not_found', }]) self.assertTrue(len(entries) >= 50) original_video = entries[0] - self.assertEqual(original_video['id'], 'OQpdSVF_k_w') + self.assertEqual(original_video['id'], 'uVJ0Il5WvbE') def test_youtube_toptracks(self): print('Skipping: The playlist page gives error 500') From d76d59d99d05fba94963690a039d38373dddc658 Mon Sep 17 00:00:00 2001 From: df Date: Fri, 29 Oct 2021 03:10:35 +0100 Subject: [PATCH 10/70] Remove obsolete non-working test_youtube_toptracks --- test/test_youtube_lists.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/test/test_youtube_lists.py b/test/test_youtube_lists.py index fae8a950a..69c5d52eb 100644 --- a/test/test_youtube_lists.py +++ b/test/test_youtube_lists.py @@ -43,15 +43,6 @@ class TestYoutubeLists(unittest.TestCase): original_video = entries[0] self.assertEqual(original_video['id'], 'uVJ0Il5WvbE') - def test_youtube_toptracks(self): - print('Skipping: The playlist page gives error 500') - return - dl = FakeYDL() - ie = YoutubePlaylistIE(dl) - result = ie.extract('https://www.youtube.com/playlist?list=MCUS') - entries = result['entries'] - self.assertEqual(len(entries), 100) - def test_youtube_flat_playlist_extraction(self): dl = FakeYDL() dl.params['extract_flat'] = True From 39ca35e7651048c2adf558f1d6db2df0de4554f5 Mon Sep 17 00:00:00 2001 From: df Date: Mon, 1 Nov 2021 04:44:57 +0000 Subject: [PATCH 11/70] Fix test_youtube_flat_playlist_extraction --- test/test_youtube_lists.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_youtube_lists.py b/test/test_youtube_lists.py index 69c5d52eb..07a6b6d06 100644 --- a/test/test_youtube_lists.py +++ b/test/test_youtube_lists.py @@ -52,7 +52,7 @@ class TestYoutubeLists(unittest.TestCase): entries = list(result['entries']) self.assertTrue(len(entries) == 1) video = entries[0] - self.assertEqual(video['_type'], 'url_transparent') + self.assertEqual(video['_type'], 'url') self.assertEqual(video['ie_key'], 'Youtube') self.assertEqual(video['id'], 'BaW_jenozKc') self.assertEqual(video['url'], 'BaW_jenozKc') From 5f5de51a499f732a6e687f32037e130cbdc50c8f Mon Sep 17 00:00:00 2001 From: df Date: Mon, 1 Nov 2021 13:34:29 +0000 Subject: [PATCH 12/70] Add compat_map/filter and use the former --- youtube_dl/compat.py | 21 +++++++++++++++++++++ youtube_dl/extractor/youtube.py | 1 + 2 files changed, 22 insertions(+) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 9e45c454b..29e0d3a02 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -2962,6 +2962,25 @@ else: compat_Struct = struct.Struct +# compat_map/filter() returning an iterator, supposedly the +# same versioning as for zip below +try: + from future_builtins import map as compat_map +except ImportError: + try: + from itertools import imap as compat_map + except ImportError: + compat_map = map + +try: + from future_builtins import filter as compat_filter +except ImportError: + try: + from itertools import ifilter as compat_filter + except ImportError: + compat_filter = filter + + try: from future_builtins import zip as compat_zip except ImportError: # not 2.6+ or is 3.x @@ -3015,6 +3034,7 @@ __all__ = [ 'compat_etree_fromstring', 'compat_etree_register_namespace', 'compat_expanduser', + 'compat_filter', 'compat_get_terminal_size', 'compat_getenv', 'compat_getpass', @@ -3026,6 +3046,7 @@ __all__ = [ 'compat_integer_types', 'compat_itertools_count', 'compat_kwargs', + 'compat_map', 'compat_numeric_types', 'compat_ord', 'compat_os_name', diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 62e58c13e..da410f8f0 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -13,6 +13,7 @@ from .common import InfoExtractor, SearchInfoExtractor from ..compat import ( compat_chr, compat_HTTPError, + compat_map as map, compat_parse_qs, compat_str, compat_urllib_parse_unquote_plus, From 96f87aaa3b34d80bc72097a7475d8093849091fc Mon Sep 17 00:00:00 2001 From: df Date: Tue, 2 Nov 2021 11:18:39 +0000 Subject: [PATCH 13/70] Back-port JS interpreter upgrade from yt-dlp PR #1437 --- test/test_jsinterp.py | 51 +++++ youtube_dl/compat.py | 5 + youtube_dl/jsinterp.py | 496 ++++++++++++++++++++++++++++++++--------- 3 files changed, 449 insertions(+), 103 deletions(-) diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py index c24b8ca74..4d05ea610 100644 --- a/test/test_jsinterp.py +++ b/test/test_jsinterp.py @@ -112,6 +112,57 @@ class TestJSInterpreter(unittest.TestCase): ''') self.assertEqual(jsi.call_function('z'), 5) + def test_for_loop(self): + # function x() { a=0; for (i=0; i-10; i++) {a++} a } + jsi = JSInterpreter(''' + function x() { a=0; for (i=0; i-10; i = i + 1) {a++} a } + ''') + self.assertEqual(jsi.call_function('x'), 10) + + def test_switch(self): + jsi = JSInterpreter(''' + function x(f) { switch(f){ + case 1:f+=1; + case 2:f+=2; + case 3:f+=3;break; + case 4:f+=4; + default:f=0; + } return f } + ''') + self.assertEqual(jsi.call_function('x', 1), 7) + self.assertEqual(jsi.call_function('x', 3), 6) + self.assertEqual(jsi.call_function('x', 5), 0) + + def test_try(self): + jsi = JSInterpreter(''' + function x() { try{return 10} catch(e){return 5} } + ''') + self.assertEqual(jsi.call_function('x'), 10) + + def test_for_loop_continue(self): + jsi = JSInterpreter(''' + function x() { a=0; for (i=0; i-10; i++) { continue; a++ } a } + ''') + self.assertEqual(jsi.call_function('x'), 0) + + def test_for_loop_break(self): + jsi = JSInterpreter(''' + function x() { a=0; for (i=0; i-10; i++) { break; a++ } a } + ''') + self.assertEqual(jsi.call_function('x'), 0) + + def test_literal_list(self): + jsi = JSInterpreter(''' + function x() { [1, 2, "asdf", [5, 6, 7]][3] } + ''') + self.assertEqual(jsi.call_function('x'), [5, 6, 7]) + + def test_comma(self): + jsi = JSInterpreter(''' + function x() { a=5; a -= 1, a+=3; return a } + ''') + self.assertEqual(jsi.call_function('x'), 7) + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 29e0d3a02..2004a405a 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -21,6 +21,10 @@ import subprocess import sys import xml.etree.ElementTree +try: + import collections.abc as compat_collections_abc +except ImportError: + import collections as compat_collections_abc try: import urllib.request as compat_urllib_request @@ -3025,6 +3029,7 @@ __all__ = [ 'compat_b64decode', 'compat_basestring', 'compat_chr', + 'compat_collections_abc', 'compat_cookiejar', 'compat_cookiejar_Cookie', 'compat_cookies', diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index 7bda59610..061e92c2a 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -8,6 +8,15 @@ from .utils import ( ExtractorError, remove_quotes, ) +from .compat import ( + compat_collections_abc +) +MutableMapping = compat_collections_abc.MutableMapping + + +class Nonlocal: + pass + _OPERATORS = [ ('|', operator.or_), @@ -22,11 +31,55 @@ _OPERATORS = [ ('*', operator.mul), ] _ASSIGN_OPERATORS = [(op + '=', opfunc) for op, opfunc in _OPERATORS] -_ASSIGN_OPERATORS.append(('=', lambda cur, right: right)) +_ASSIGN_OPERATORS.append(('=', (lambda cur, right: right))) _NAME_RE = r'[a-zA-Z_$][a-zA-Z_$0-9]*' +class JS_Break(ExtractorError): + def __init__(self): + ExtractorError.__init__(self, 'Invalid break') + + +class JS_Continue(ExtractorError): + def __init__(self): + ExtractorError.__init__(self, 'Invalid continue') + + +class LocalNameSpace(MutableMapping): + def __init__(self, *stack): + self.stack = tuple(stack) + + def __getitem__(self, key): + for scope in self.stack: + if key in scope: + return scope[key] + raise KeyError(key) + + def __setitem__(self, key, value): + for scope in self.stack: + if key in scope: + scope[key] = value + break + else: + self.stack[0][key] = value + return value + + def __delitem__(self, key): + raise NotImplementedError('Deleting is not supported') + + def __iter__(self): + for scope in self.stack: + for scope_item in iter(scope): + yield scope_item + + def __len__(self, key): + return len(iter(self)) + + def __repr__(self): + return 'LocalNameSpace%s' % (self.stack, ) + + class JSInterpreter(object): def __init__(self, code, objects=None): if objects is None: @@ -34,11 +87,58 @@ class JSInterpreter(object): self.code = code self._functions = {} self._objects = objects + self.__named_object_counter = 0 + + def _named_object(self, namespace, obj): + self.__named_object_counter += 1 + name = '__youtube_dl_jsinterp_obj%s' % (self.__named_object_counter, ) + namespace[name] = obj + return name + + @staticmethod + def _separate(expr, delim=',', max_split=None): + if not expr: + return + parens = {'(': 0, '{': 0, '[': 0, ']': 0, '}': 0, ')': 0} + start, splits, pos, max_pos = 0, 0, 0, len(delim) - 1 + for idx, char in enumerate(expr): + if char in parens: + parens[char] += 1 + is_in_parens = (parens['['] - parens[']'] + or parens['('] - parens[')'] + or parens['{'] - parens['}']) + if char == delim[pos] and not is_in_parens: + if pos == max_pos: + pos = 0 + yield expr[start: idx - max_pos] + start = idx + 1 + splits += 1 + if max_split and splits >= max_split: + break + else: + pos += 1 + else: + pos = 0 + yield expr[start:] + + @staticmethod + def _separate_at_paren(expr, delim): + separated = list(JSInterpreter._separate(expr, delim, 1)) + if len(separated) < 2: + raise ExtractorError('No terminating paren {0} in {1}'.format(delim, expr)) + return separated[0][1:].strip(), separated[1].strip() def interpret_statement(self, stmt, local_vars, allow_recursion=100): if allow_recursion < 0: raise ExtractorError('Recursion limit reached') + sub_statements = list(self._separate(stmt, ';')) + stmt = (sub_statements or ['']).pop() + for sub_stmt in sub_statements: + ret, should_abort = self.interpret_statement(sub_stmt, local_vars, allow_recursion - 1) + if should_abort: + return ret + should_abort = False stmt = stmt.lstrip() stmt_m = re.match(r'var\s', stmt) @@ -61,25 +161,119 @@ class JSInterpreter(object): if expr == '': # Empty expression return None - if expr.startswith('('): - parens_count = 0 - for m in re.finditer(r'[()]', expr): - if m.group(0) == '(': - parens_count += 1 - else: - parens_count -= 1 - if parens_count == 0: - sub_expr = expr[1:m.start()] - sub_result = self.interpret_expression( - sub_expr, local_vars, allow_recursion) - remaining_expr = expr[m.end():].strip() - if not remaining_expr: - return sub_result - else: - expr = json.dumps(sub_result) + remaining_expr - break + if expr.startswith('{'): + inner, outer = self._separate_at_paren(expr, '}') + inner, should_abort = self.interpret_statement(inner, local_vars, allow_recursion - 1) + if not outer or should_abort: + return inner else: - raise ExtractorError('Premature end of parens in %r' % expr) + expr = json.dumps(inner) + outer + + if expr.startswith('('): + inner, outer = self._separate_at_paren(expr, ')') + inner = self.interpret_expression(inner, local_vars, allow_recursion) + if not outer: + return inner + else: + expr = json.dumps(inner) + outer + + if expr.startswith('['): + inner, outer = self._separate_at_paren(expr, ']') + name = self._named_object(local_vars, [ + self.interpret_expression(item, local_vars, allow_recursion) + for item in self._separate(inner)]) + expr = name + outer + + m = re.match(r'try\s*', expr) + if m: + if expr[m.end()] == '{': + try_expr, expr = self._separate_at_paren(expr[m.end():], '}') + else: + try_expr, expr = expr[m.end() - 1:], '' + ret, should_abort = self.interpret_statement(try_expr, local_vars, allow_recursion - 1) + if should_abort: + return ret + return self.interpret_statement(expr, local_vars, allow_recursion - 1)[0] + + m = re.match(r'(?:(?Pcatch)|(?Pfor)|(?Pswitch))\s*\(', expr) + md = m.groupdict() if m else {} + if md.get('catch'): + # We ignore the catch block + _, expr = self._separate_at_paren(expr, '}') + return self.interpret_statement(expr, local_vars, allow_recursion - 1)[0] + + elif md.get('for'): + def raise_constructor_error(c): + raise ExtractorError( + 'Premature return in the initialization of a for loop in {0!r}'.format(c)) + + constructor, remaining = self._separate_at_paren(expr[m.end() - 1:], ')') + if remaining.startswith('{'): + body, expr = self._separate_at_paren(remaining, '}') + else: + m = re.match(r'switch\s*\(', remaining) # FIXME + if m: + switch_val, remaining = self._separate_at_paren(remaining[m.end() - 1:], ')') + body, expr = self._separate_at_paren(remaining, '}') + body = 'switch(%s){%s}' % (switch_val, body) + else: + body, expr = remaining, '' + start, cndn, increment = self._separate(constructor, ';') + if self.interpret_statement(start, local_vars, allow_recursion - 1)[1]: + raise_constructor_error(constructor) + while True: + if not self.interpret_expression(cndn, local_vars, allow_recursion): + break + try: + ret, should_abort = self.interpret_statement(body, local_vars, allow_recursion - 1) + if should_abort: + return ret + except JS_Break: + break + except JS_Continue: + pass + if self.interpret_statement(increment, local_vars, allow_recursion - 1)[1]: + raise_constructor_error(constructor) + return self.interpret_statement(expr, local_vars, allow_recursion - 1)[0] + + elif md.get('switch'): + switch_val, remaining = self._separate_at_paren(expr[m.end() - 1:], ')') + switch_val = self.interpret_expression(switch_val, local_vars, allow_recursion) + body, expr = self._separate_at_paren(remaining, '}') + body, default = body.split('default:') if 'default:' in body else (body, None) + items = body.split('case ')[1:] + if default: + items.append('default:%s' % (default, )) + matched = False + for item in items: + case, stmt = [i.strip() for i in self._separate(item, ':', 1)] + matched = matched or case == 'default' or switch_val == self.interpret_expression(case, local_vars, allow_recursion) + if matched: + try: + ret, should_abort = self.interpret_statement(stmt, local_vars, allow_recursion - 1) + if should_abort: + return ret + except JS_Break: + break + return self.interpret_statement(expr, local_vars, allow_recursion - 1)[0] + + # Comma separated statements + sub_expressions = list(self._separate(expr)) + expr = sub_expressions.pop().strip() if sub_expressions else '' + for sub_expr in sub_expressions: + self.interpret_expression(sub_expr, local_vars, allow_recursion) + + for m in re.finditer(r'''(?x) + (?P\+\+|--)(?P%(_NAME_RE)s)| + (?P%(_NAME_RE)s)(?P\+\+|--)''' % globals(), expr): + var = m.group('var1') or m.group('var2') + start, end = m.span() + sign = m.group('pre_sign') or m.group('post_sign') + ret = local_vars[var] + local_vars[var] += 1 if sign[0] == '+' else -1 + if m.group('pre_sign'): + ret = local_vars[var] + expr = expr[:start] + json.dumps(ret) + expr[end:] for op, opfunc in _ASSIGN_OPERATORS: m = re.match(r'''(?x) @@ -88,14 +282,13 @@ class JSInterpreter(object): (?P.*)$''' % (_NAME_RE, re.escape(op)), expr) if not m: continue - right_val = self.interpret_expression( - m.group('expr'), local_vars, allow_recursion - 1) + right_val = self.interpret_expression(m.group('expr'), local_vars, allow_recursion) if m.groupdict().get('index'): lvar = local_vars[m.group('out')] - idx = self.interpret_expression( - m.group('index'), local_vars, allow_recursion) - assert isinstance(idx, int) + idx = self.interpret_expression(m.group('index'), local_vars, allow_recursion) + if not isinstance(idx, int): + raise ExtractorError('List indices must be integers: %s' % (idx, )) cur = lvar[idx] val = opfunc(cur, right_val) lvar[idx] = val @@ -109,8 +302,13 @@ class JSInterpreter(object): if expr.isdigit(): return int(expr) + if expr == 'break': + raise JS_Break() + elif expr == 'continue': + raise JS_Continue() + var_m = re.match( - r'(?!if|return|true|false)(?P%s)$' % _NAME_RE, + r'(?!if|return|true|false|null)(?P%s)$' % _NAME_RE, expr) if var_m: return local_vars[var_m.group('name')] @@ -124,91 +322,161 @@ class JSInterpreter(object): r'(?P%s)\[(?P.+)\]$' % _NAME_RE, expr) if m: val = local_vars[m.group('in')] - idx = self.interpret_expression( - m.group('idx'), local_vars, allow_recursion - 1) + idx = self.interpret_expression(m.group('idx'), local_vars, allow_recursion) return val[idx] + def raise_expr_error(where, op, exp): + raise ExtractorError('Premature {0} return of {1} in {2!r}'.format(where, op, exp)) + + for op, opfunc in _OPERATORS: + separated = list(self._separate(expr, op)) + if len(separated) < 2: + continue + right_val = separated.pop() + left_val = op.join(separated) + left_val, should_abort = self.interpret_statement( + left_val, local_vars, allow_recursion - 1) + if should_abort: + raise_expr_error('left-side', op, expr) + right_val, should_abort = self.interpret_statement( + right_val, local_vars, allow_recursion - 1) + if should_abort: + raise_expr_error('right-side', op, expr) + return opfunc(left_val or 0, right_val) + m = re.match( - r'(?P%s)(?:\.(?P[^(]+)|\[(?P[^]]+)\])\s*(?:\(+(?P[^()]*)\))?$' % _NAME_RE, + r'(?P%s)(?:\.(?P[^(]+)|\[(?P[^]]+)\])\s*' % _NAME_RE, expr) if m: variable = m.group('var') - member = remove_quotes(m.group('member') or m.group('member2')) - arg_str = m.group('args') + nl = Nonlocal() - if variable in local_vars: - obj = local_vars[variable] + nl.member = remove_quotes(m.group('member') or m.group('member2')) + arg_str = expr[m.end():] + if arg_str.startswith('('): + arg_str, remaining = self._separate_at_paren(arg_str, ')') else: - if variable not in self._objects: - self._objects[variable] = self.extract_object(variable) - obj = self._objects[variable] + arg_str, remaining = None, arg_str - if arg_str is None: - # Member access - if member == 'length': - return len(obj) - return obj[member] + def assertion(cndn, msg): + """ assert, but without risk of getting optimized out """ + if not cndn: + raise ExtractorError('{0} {1}: {2}'.format(nl.member, msg, expr)) - assert expr.endswith(')') - # Function call - if arg_str == '': - argvals = tuple() - else: - argvals = tuple([ + def eval_method(): + # nonlocal member + member = nl.member + if variable == 'String': + obj = str + elif variable in local_vars: + obj = local_vars[variable] + else: + if variable not in self._objects: + self._objects[variable] = self.extract_object(variable) + obj = self._objects[variable] + + if arg_str is None: + # Member access + if member == 'length': + return len(obj) + return obj[member] + + # Function call + argvals = [ self.interpret_expression(v, local_vars, allow_recursion) - for v in arg_str.split(',')]) + for v in self._separate(arg_str)] - if member == 'split': - assert argvals == ('',) - return list(obj) - if member == 'join': - assert len(argvals) == 1 - return argvals[0].join(obj) - if member == 'reverse': - assert len(argvals) == 0 - obj.reverse() - return obj - if member == 'slice': - assert len(argvals) == 1 - return obj[argvals[0]:] - if member == 'splice': - assert isinstance(obj, list) - index, howMany = argvals - res = [] - for i in range(index, min(index + howMany, len(obj))): - res.append(obj.pop(index)) - return res + if obj == str: + if member == 'fromCharCode': + assertion(argvals, 'takes one or more arguments') + return ''.join(map(chr, argvals)) + raise ExtractorError('Unsupported string method %s' % (member, )) - return obj[member](argvals) + if member == 'split': + assertion(argvals, 'takes one or more arguments') + assertion(argvals == [''], 'with arguments is not implemented') + return list(obj) + elif member == 'join': + assertion(isinstance(obj, list), 'must be applied on a list') + assertion(len(argvals) == 1, 'takes exactly one argument') + return argvals[0].join(obj) + elif member == 'reverse': + assertion(not argvals, 'does not take any arguments') + obj.reverse() + return obj + elif member == 'slice': + assertion(isinstance(obj, list), 'must be applied on a list') + assertion(len(argvals) == 1, 'takes exactly one argument') + return obj[argvals[0]:] + elif member == 'splice': + assertion(isinstance(obj, list), 'must be applied on a list') + assertion(argvals, 'takes one or more arguments') + index, howMany = (argvals + [len(obj)])[:2] + if index < 0: + index += len(obj) + add_items = argvals[2:] + res = [] + for i in range(index, min(index + howMany, len(obj))): + res.append(obj.pop(index)) + for i, item in enumerate(add_items): + obj.insert(index + i, item) + return res + elif member == 'unshift': + assertion(isinstance(obj, list), 'must be applied on a list') + assertion(argvals, 'takes one or more arguments') + for item in reversed(argvals): + obj.insert(0, item) + return obj + elif member == 'pop': + assertion(isinstance(obj, list), 'must be applied on a list') + assertion(not argvals, 'does not take any arguments') + if not obj: + return + return obj.pop() + elif member == 'push': + assertion(argvals, 'takes one or more arguments') + obj.extend(argvals) + return obj + elif member == 'forEach': + assertion(argvals, 'takes one or more arguments') + assertion(len(argvals) <= 2, 'takes at-most 2 arguments') + f, this = (argvals + [''])[:2] + return [f((item, idx, obj), this=this) for idx, item in enumerate(obj)] + elif member == 'indexOf': + assertion(argvals, 'takes one or more arguments') + assertion(len(argvals) <= 2, 'takes at-most 2 arguments') + idx, start = (argvals + [0])[:2] + try: + return obj.index(idx, start) + except ValueError: + return -1 - for op, opfunc in _OPERATORS: - m = re.match(r'(?P.+?)%s(?P.+)' % re.escape(op), expr) - if not m: - continue - x, abort = self.interpret_statement( - m.group('x'), local_vars, allow_recursion - 1) - if abort: - raise ExtractorError( - 'Premature left-side return of %s in %r' % (op, expr)) - y, abort = self.interpret_statement( - m.group('y'), local_vars, allow_recursion - 1) - if abort: - raise ExtractorError( - 'Premature right-side return of %s in %r' % (op, expr)) - return opfunc(x, y) + if isinstance(obj, list): + member = int(member) + nl.member = member + return obj[member](argvals) - m = re.match( - r'^(?P%s)\((?P[a-zA-Z0-9_$,]*)\)$' % _NAME_RE, expr) + if remaining: + return self.interpret_expression( + self._named_object(local_vars, eval_method()) + remaining, + local_vars, allow_recursion) + else: + return eval_method() + + m = re.match(r'^(?P%s)\((?P[a-zA-Z0-9_$,]*)\)$' % _NAME_RE, expr) if m: fname = m.group('func') argvals = tuple([ int(v) if v.isdigit() else local_vars[v] - for v in m.group('args').split(',')]) if len(m.group('args')) > 0 else tuple() - if fname not in self._functions: + for v in self._separate(m.group('args'))]) + if fname in local_vars: + return local_vars[fname](argvals) + elif fname not in self._functions: self._functions[fname] = self.extract_function(fname) return self._functions[fname](argvals) - raise ExtractorError('Unsupported JS expression %r' % expr) + if expr: + raise ExtractorError('Unsupported JS expression %r' % expr) def extract_object(self, objname): _FUNC_NAME_RE = r'''(?:[a-zA-Z$0-9]+|"[a-zA-Z$0-9]+"|'[a-zA-Z$0-9]+')''' @@ -233,30 +501,52 @@ class JSInterpreter(object): return obj - def extract_function(self, funcname): + def extract_function_code(self, funcname): + """ @returns argnames, code """ func_m = re.search( r'''(?x) - (?:function\s+%s|[{;,]\s*%s\s*=\s*function|var\s+%s\s*=\s*function)\s* + (?:function\s+%(f_n)s|[{;,]\s*%(f_n)s\s*=\s*function|var\s+%(f_n)s\s*=\s*function)\s* \((?P[^)]*)\)\s* - \{(?P[^}]+)\}''' % ( - re.escape(funcname), re.escape(funcname), re.escape(funcname)), + (?P\{(?:(?!};)[^"]|"([^"]|\\")*")+\})''' % {'f_n': re.escape(funcname), }, self.code) + code, _ = self._separate_at_paren(func_m.group('code'), '}') # refine the match if func_m is None: raise ExtractorError('Could not find JS function %r' % funcname) - argnames = func_m.group('args').split(',') + return func_m.group('args').split(','), code - return self.build_function(argnames, func_m.group('code')) + def extract_function(self, funcname): + return self.extract_function_from_code(*self.extract_function_code(funcname)) + + def extract_function_from_code(self, argnames, code, *global_stack): + local_vars = {} + while True: + mobj = re.search(r'function\((?P[^)]*)\)\s*{', code) + if mobj is None: + break + start, body_start = mobj.span() + body, remaining = self._separate_at_paren(code[body_start - 1:], '}') + name = self._named_object( + local_vars, + self.extract_function_from_code( + [str.strip(x) for x in mobj.group('args').split(',')], + body, local_vars, *global_stack)) + code = code[:start] + name + remaining + return self.build_function(argnames, code, local_vars, *global_stack) def call_function(self, funcname, *args): - f = self.extract_function(funcname) - return f(args) + return self.extract_function(funcname)(args) - def build_function(self, argnames, code): - def resf(args): - local_vars = dict(zip(argnames, args)) - for stmt in code.split(';'): - res, abort = self.interpret_statement(stmt, local_vars) - if abort: + def build_function(self, argnames, code, *global_stack): + global_stack = list(global_stack) or [{}] + local_vars = global_stack.pop(0) + + def resf(args, **kwargs): + local_vars.update(dict(zip(argnames, args))) + local_vars.update(kwargs) + var_stack = LocalNameSpace(local_vars, *global_stack) + for stmt in self._separate(code.replace('\n', ''), ';'): + ret, should_abort = self.interpret_statement(stmt, var_stack) + if should_abort: break - return res + return ret return resf From e1eae16b56b5c57e341b000167c0a92e67095e6e Mon Sep 17 00:00:00 2001 From: df Date: Thu, 4 Nov 2021 12:48:06 +0000 Subject: [PATCH 14/70] Handle default in switch better Add https://github.com/yt-dlp/yt-dlp/commit/a1fc7ca0743c8df06416e68ee74b64e07dfe7135 Thanks coletdjnz --- test/test_jsinterp.py | 15 +++++++++++++++ youtube_dl/jsinterp.py | 23 ++++++++++++++--------- 2 files changed, 29 insertions(+), 9 deletions(-) diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py index 4d05ea610..acdabffb1 100644 --- a/test/test_jsinterp.py +++ b/test/test_jsinterp.py @@ -133,6 +133,21 @@ class TestJSInterpreter(unittest.TestCase): self.assertEqual(jsi.call_function('x', 3), 6) self.assertEqual(jsi.call_function('x', 5), 0) + def test_switch_default(self): + jsi = JSInterpreter(''' + function x(f) { switch(f){ + case 2: f+=2; + default: f-=1; + case 5: + case 6: f+=6; + case 0: break; + case 1: f+=1; + } return f } + ''') + self.assertEqual(jsi.call_function('x', 1), 2) + self.assertEqual(jsi.call_function('x', 5), 11) + self.assertEqual(jsi.call_function('x', 9), 14) + def test_try(self): jsi = JSInterpreter(''' function x() { try{return 10} catch(e){return 5} } diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index 061e92c2a..c35765702 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -240,21 +240,26 @@ class JSInterpreter(object): switch_val, remaining = self._separate_at_paren(expr[m.end() - 1:], ')') switch_val = self.interpret_expression(switch_val, local_vars, allow_recursion) body, expr = self._separate_at_paren(remaining, '}') - body, default = body.split('default:') if 'default:' in body else (body, None) - items = body.split('case ')[1:] - if default: - items.append('default:%s' % (default, )) - matched = False - for item in items: - case, stmt = [i.strip() for i in self._separate(item, ':', 1)] - matched = matched or case == 'default' or switch_val == self.interpret_expression(case, local_vars, allow_recursion) - if matched: + items = body.replace('default:', 'case default:').split('case ')[1:] + for default in (False, True): + matched = False + for item in items: + case, stmt = [i.strip() for i in self._separate(item, ':', 1)] + if default: + matched = matched or case == 'default' + elif not matched: + matched = (case != 'default' + and switch_val == self.interpret_expression(case, local_vars, allow_recursion)) + if not matched: + continue try: ret, should_abort = self.interpret_statement(stmt, local_vars, allow_recursion - 1) if should_abort: return ret except JS_Break: break + if matched: + break return self.interpret_statement(expr, local_vars, allow_recursion - 1)[0] # Comma separated statements From 1ca673bd98cc5bbfa76d00ac84ad5f6c1376db01 Mon Sep 17 00:00:00 2001 From: dirkf Date: Sat, 27 Nov 2021 02:06:13 +0000 Subject: [PATCH 15/70] Fix splice to handle float Needed for new youtube js player f1ca6900 Add https://github.com/yt-dlp/yt-dlp/commit/57dbe8077f8d00e0fffac53669f40cd7d584474f#diff-729b57caa8d006426f6a8960c061f519a8b6658682284015e069745af52ffb07 --- youtube_dl/jsinterp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index c35765702..c75cf45b9 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -416,7 +416,7 @@ class JSInterpreter(object): elif member == 'splice': assertion(isinstance(obj, list), 'must be applied on a list') assertion(argvals, 'takes one or more arguments') - index, howMany = (argvals + [len(obj)])[:2] + index, howMany = map(int, (argvals + [len(obj)])[:2]) if index < 0: index += len(obj) add_items = argvals[2:] From 9d142109f445ea247e476cfc0e0ca134f6ebb802 Mon Sep 17 00:00:00 2001 From: dirkf Date: Sat, 27 Nov 2021 03:18:29 +0000 Subject: [PATCH 16/70] Back-port test_youtube_signature.py from yt-dlp and fix JSInterp accordingly --- test/test_youtube_signature.py | 89 ++++++++++++++++++++++++---------- youtube_dl/jsinterp.py | 9 ++-- 2 files changed, 69 insertions(+), 29 deletions(-) diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index 627d4cb92..c8e85b500 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -14,9 +14,10 @@ import string from test.helper import FakeYDL from youtube_dl.extractor import YoutubeIE +from youtube_dl.jsinterp import JSInterpreter from youtube_dl.compat import compat_str, compat_urlretrieve -_TESTS = [ +_SIG_TESTS = [ ( 'https://s.ytimg.com/yts/jsbin/html5player-vflHOr_nV.js', 86, @@ -64,6 +65,25 @@ _TESTS = [ ) ] +_NSIG_TESTS = [ + ( + 'https://www.youtube.com/s/player/9216d1f7/player_ias.vflset/en_US/base.js', + 'SLp9F5bwjAdhE9F-', 'gWnb9IK2DJ8Q1w', + ), + ( + 'https://www.youtube.com/s/player/f8cb7a3b/player_ias.vflset/en_US/base.js', + 'oBo2h5euWy6osrUt', 'ivXHpm7qJjJN', + ), + ( + 'https://www.youtube.com/s/player/2dfe380c/player_ias.vflset/en_US/base.js', + 'oBo2h5euWy6osrUt', '3DIBbn3qdQ', + ), + ( + 'https://www.youtube.com/s/player/f1ca6900/player_ias.vflset/en_US/base.js', + 'cu3wyu6LQn2hse', 'jvxetvmlI9AN9Q', + ), +] + class TestPlayerInfo(unittest.TestCase): def test_youtube_extract_player_info(self): @@ -95,35 +115,54 @@ class TestSignature(unittest.TestCase): os.mkdir(self.TESTDATA_DIR) -def make_tfunc(url, sig_input, expected_sig): - m = re.match(r'.*-([a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.[a-z]+$', url) - assert m, '%r should follow URL format' % url - test_id = m.group(1) +def t_factory(name, sig_func, url_pattern): + def make_tfunc(url, sig_input, expected_sig): + m = url_pattern.match(url) + assert m, '%r should follow URL format' % url + test_id = m.group('id') - def test_func(self): - basename = 'player-%s.js' % test_id - fn = os.path.join(self.TESTDATA_DIR, basename) + def test_func(self): + basename = 'player-{0}-{1}.js'.format(name, test_id) + fn = os.path.join(self.TESTDATA_DIR, basename) - if not os.path.exists(fn): - compat_urlretrieve(url, fn) + if not os.path.exists(fn): + compat_urlretrieve(url, fn) + with io.open(fn, encoding='utf-8') as testf: + jscode = testf.read() + self.assertEqual(sig_func(jscode, sig_input), expected_sig) - ydl = FakeYDL() - ie = YoutubeIE(ydl) - with io.open(fn, encoding='utf-8') as testf: - jscode = testf.read() - func = ie._parse_sig_js(jscode) - src_sig = ( - compat_str(string.printable[:sig_input]) - if isinstance(sig_input, int) else sig_input) - got_sig = func(src_sig) - self.assertEqual(got_sig, expected_sig) - - test_func.__name__ = str('test_signature_js_' + test_id) - setattr(TestSignature, test_func.__name__, test_func) + test_func.__name__ = str('test_{0}_js_{1}'.format(name, test_id)) + setattr(TestSignature, test_func.__name__, test_func) + return make_tfunc -for test_spec in _TESTS: - make_tfunc(*test_spec) +def signature(jscode, sig_input): + func = YoutubeIE(FakeYDL())._parse_sig_js(jscode) + src_sig = ( + compat_str(string.printable[:sig_input]) + if isinstance(sig_input, int) else sig_input) + return func(src_sig) + + +def n_sig(jscode, sig_input): + # Pending implementation of _extract_n_function_name() or similar in + # youtube.py, hard-code here + # funcname = YoutubeIE(FakeYDL())._extract_n_function_name(jscode) + import re + funcname = re.search(r'[=(,&|](\w+)\(\w+\),\w+\.set\("n",', jscode) + funcname = funcname and funcname.group(1) + return JSInterpreter(jscode).call_function(funcname, sig_input) + + +make_sig_test = t_factory( + 'signature', signature, re.compile(r'.*-(?P[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.[a-z]+$')) +for test_spec in _SIG_TESTS: + make_sig_test(*test_spec) + +make_nsig_test = t_factory( + 'nsig', n_sig, re.compile(r'.+/player/(?P[a-zA-Z0-9_-]+)/.+.js$')) +for test_spec in _NSIG_TESTS: + make_nsig_test(*test_spec) if __name__ == '__main__': diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index c75cf45b9..a2306557b 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -9,7 +9,8 @@ from .utils import ( remove_quotes, ) from .compat import ( - compat_collections_abc + compat_collections_abc, + compat_str, ) MutableMapping = compat_collections_abc.MutableMapping @@ -372,7 +373,7 @@ class JSInterpreter(object): # nonlocal member member = nl.member if variable == 'String': - obj = str + obj = compat_str elif variable in local_vars: obj = local_vars[variable] else: @@ -391,7 +392,7 @@ class JSInterpreter(object): self.interpret_expression(v, local_vars, allow_recursion) for v in self._separate(arg_str)] - if obj == str: + if obj == compat_str: if member == 'fromCharCode': assertion(argvals, 'takes one or more arguments') return ''.join(map(chr, argvals)) @@ -533,7 +534,7 @@ class JSInterpreter(object): name = self._named_object( local_vars, self.extract_function_from_code( - [str.strip(x) for x in mobj.group('args').split(',')], + [x.strip() for x in mobj.group('args').split(',')], body, local_vars, *global_stack)) code = code[:start] + name + remaining return self.build_function(argnames, code, local_vars, *global_stack) From 6ca7b776965ed1e9220690edc4ee22de8c8587f5 Mon Sep 17 00:00:00 2001 From: dirkf Date: Fri, 10 Dec 2021 19:14:54 +0000 Subject: [PATCH 17/70] Refactor JSInterpreter._separate yt-dlp/yt-dlp/@06dfe0a, improve _MATCHING_PARENS --- youtube_dl/jsinterp.py | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index a2306557b..8eaa911cd 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -36,6 +36,8 @@ _ASSIGN_OPERATORS.append(('=', (lambda cur, right: right))) _NAME_RE = r'[a-zA-Z_$][a-zA-Z_$0-9]*' +_MATCHING_PARENS = dict(zip(*zip('()', '{}', '[]'))) + class JS_Break(ExtractorError): def __init__(self): @@ -100,26 +102,24 @@ class JSInterpreter(object): def _separate(expr, delim=',', max_split=None): if not expr: return - parens = {'(': 0, '{': 0, '[': 0, ']': 0, '}': 0, ')': 0} - start, splits, pos, max_pos = 0, 0, 0, len(delim) - 1 + counters = {k: 0 for k in _MATCHING_PARENS.values()} + start, splits, pos, delim_len = 0, 0, 0, len(delim) - 1 for idx, char in enumerate(expr): - if char in parens: - parens[char] += 1 - is_in_parens = (parens['['] - parens[']'] - or parens['('] - parens[')'] - or parens['{'] - parens['}']) - if char == delim[pos] and not is_in_parens: - if pos == max_pos: - pos = 0 - yield expr[start: idx - max_pos] - start = idx + 1 - splits += 1 - if max_split and splits >= max_split: - break - else: - pos += 1 - else: + if char in _MATCHING_PARENS: + counters[_MATCHING_PARENS[char]] += 1 + elif char in counters: + counters[char] -= 1 + if char != delim[pos] or any(counters.values()): pos = 0 + continue + elif pos != delim_len: + pos += 1 + continue + yield expr[start: idx - delim_len] + start, pos = idx + 1, 0 + splits += 1 + if max_split and splits >= max_split: + break yield expr[start:] @staticmethod From af9e72507ea38e5ab3fa2751ed09ec88021260cb Mon Sep 17 00:00:00 2001 From: df Date: Mon, 1 Nov 2021 04:45:42 +0000 Subject: [PATCH 18/70] Implement n-param descrambling using JSInterp Fixes #29326, closes #29790, closes #30004, closes #30024, closes #30052, closes #30088, closes #30097, closes #30102, closes #30109, closes #30119, closes #30125, closes #30128, closes #30162, closes #30173, closes #30186, closes #30192, closes #30221, closes #30239, closes #30539, closes #30552. --- youtube_dl/extractor/youtube.py | 115 +++++++++++++++++++++++++++----- 1 file changed, 99 insertions(+), 16 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index da410f8f0..63918924d 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1254,6 +1254,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor): raise ExtractorError('Cannot identify player %r' % player_url) return id_m.group('id') + def _get_player_code(self, video_id, player_url, player_id=None): + if not player_id: + player_id = self._extract_player_info(player_url) + + if player_id not in self._code_cache: + self._code_cache[player_id] = self._download_webpage( + player_url, video_id, + note='Downloading player ' + player_id, + errnote='Download of %s failed' % player_url) + return self._code_cache[player_id] + def _extract_signature_function(self, video_id, player_url, example_sig): player_id = self._extract_player_info(player_url) @@ -1266,12 +1277,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if cache_spec is not None: return lambda s: ''.join(s[i] for i in cache_spec) - if player_id not in self._code_cache: - self._code_cache[player_id] = self._download_webpage( - player_url, video_id, - note='Downloading player ' + player_id, - errnote='Download of %s failed' % player_url) - code = self._code_cache[player_id] + code = self._get_player_code(video_id, player_url, player_id) res = self._parse_sig_js(code) test_string = ''.join(map(compat_chr, range(len(example_sig)))) @@ -1350,11 +1356,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if player_url is None: raise ExtractorError('Cannot decrypt signature without player_url') - if player_url.startswith('//'): - player_url = 'https:' + player_url - elif not re.match(r'https?://', player_url): - player_url = compat_urlparse.urljoin( - 'https://www.youtube.com', player_url) try: player_id = (player_url, self._signature_cache_id(s)) if player_id not in self._player_cache: @@ -1371,6 +1372,88 @@ class YoutubeIE(YoutubeBaseInfoExtractor): raise ExtractorError( 'Signature extraction failed: ' + tb, cause=e) + def _extract_player_url(self, webpage): + player_url = self._search_regex( + r'"(?:PLAYER_JS_URL|jsUrl)"\s*:\s*"([^"]+)"', + webpage or '', 'player URL', fatal=False) + if not player_url: + return + if player_url.startswith('//'): + player_url = 'https:' + player_url + elif not re.match(r'https?://', player_url): + player_url = compat_urlparse.urljoin( + 'https://www.youtube.com', player_url) + return player_url + + # from yt-dlp + # See also: + # 1. https://github.com/ytdl-org/youtube-dl/issues/29326#issuecomment-894619419 + # 2. https://code.videolan.org/videolan/vlc/-/blob/4fb284e5af69aa9ac2100ccbdd3b88debec9987f/share/lua/playlist/youtube.lua#L116 + # 3. https://github.com/ytdl-org/youtube-dl/issues/30097#issuecomment-950157377 + def _extract_n_function_name(self, jscode): + return self._search_regex( + (r'\.get\("n"\)\)&&\(b=(?P[a-zA-Z0-9$]{3})\([a-zA-Z0-9]\)',), + jscode, 'Initial JS player n function name', group='nfunc') + + def _extract_n_function(self, video_id, player_url): + player_id = self._extract_player_info(player_url) + func_code = self._downloader.cache.load('youtube-nsig', player_id) + + if func_code: + jsi = JSInterpreter(func_code) + else: + player_id = self._extract_player_info(player_url) + jscode = self._get_player_code(video_id, player_url, player_id) + funcname = self._extract_n_function_name(jscode) + jsi = JSInterpreter(jscode) + func_code = jsi.extract_function_code(funcname) + self._downloader.cache.store('youtube-nsig', player_id, func_code) + + if self._downloader.params.get('youtube_print_sig_code'): + self.to_screen('Extracted nsig function from {0}:\n{1}\n'.format(player_id, func_code[1])) + + return lambda s: jsi.extract_function_from_code(*func_code)([s]) + + def _n_descramble(self, n_param, player_url, video_id): + """Compute the response to YT's "n" parameter challenge + + Args: + n_param -- challenge string that is the value of the + URL's "n" query parameter + player_url -- URL of YT player JS + video_id + """ + + sig_id = ('nsig_value', n_param) + if sig_id in self._player_cache: + return self._player_cache[sig_id] + + try: + player_id = ('nsig', player_url) + if player_id not in self._player_cache: + self._player_cache[player_id] = self._extract_n_function(video_id, player_url) + func = self._player_cache[player_id] + self._player_cache[sig_id] = func(n_param) + if self._downloader.params.get('verbose', False): + self._downloader.to_screen('[debug] [%s] %s' % (self.IE_NAME, 'Decrypted nsig {0} => {1}'.format(n_param, self._player_cache[sig_id]))) + return self._player_cache[sig_id] + except Exception as e: + raise ExtractorError(traceback.format_exc(), cause=e, video_id=video_id) + + def _unthrottle_format_urls(self, video_id, player_url, formats): + for fmt in formats: + parsed_fmt_url = compat_urlparse.urlparse(fmt['url']) + qs = compat_urlparse.parse_qs(parsed_fmt_url.query) + n_param = qs.get('n') + if not n_param: + continue + n_param = n_param[-1] + n_response = self._n_descramble(n_param, player_url, video_id) + if n_response: + qs['n'] = [n_response] + fmt['url'] = compat_urlparse.urlunparse( + parsed_fmt_url._replace(query=compat_urllib_parse_urlencode(qs, True))) + def _mark_watched(self, video_id, player_response): playback_url = url_or_none(try_get( player_response, @@ -1632,11 +1715,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if not (sc and fmt_url and encrypted_sig): continue if not player_url: - if not webpage: - continue - player_url = self._search_regex( - r'"(?:PLAYER_JS_URL|jsUrl)"\s*:\s*"([^"]+)"', - webpage, 'player URL', fatal=False) + player_url = self._extract_player_url(webpage) if not player_url: continue signature = self._decrypt_signature(sc['s'][0], video_id, player_url) @@ -1782,6 +1861,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): is_live = video_details.get('isLive') owner_profile_url = microformat.get('ownerProfileUrl') + if not player_url: + player_url = self._extract_player_url(webpage) + self._unthrottle_format_urls(video_id, player_url, formats) + info = { 'id': video_id, 'title': self._live_title(video_title) if is_live else video_title, From 1e677567cd083d43f55daef0cc74e5fa24575ae3 Mon Sep 17 00:00:00 2001 From: dirkf Date: Tue, 1 Feb 2022 14:39:03 +0000 Subject: [PATCH 19/70] [YouTube] Fix n-sig for player e06dea74 (#30582) From yt-dl commit 48416bc --- test/test_youtube_signature.py | 24 +++++++++++++++++------- youtube_dl/extractor/youtube.py | 14 +++++++++++--- 2 files changed, 28 insertions(+), 10 deletions(-) diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index c8e85b500..fc5e9828e 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -82,6 +82,14 @@ _NSIG_TESTS = [ 'https://www.youtube.com/s/player/f1ca6900/player_ias.vflset/en_US/base.js', 'cu3wyu6LQn2hse', 'jvxetvmlI9AN9Q', ), + ( + 'https://www.youtube.com/s/player/8040e515/player_ias.vflset/en_US/base.js', + 'wvOFaY-yjgDuIEg5', 'HkfBFDHmgw4rsw', + ), + ( + 'https://www.youtube.com/s/player/e06dea74/player_ias.vflset/en_US/base.js', + 'AiuodmaDDYw8d3y4bf', 'ankd8eza2T6Qmw', + ), ] @@ -110,10 +118,17 @@ class TestPlayerInfo(unittest.TestCase): class TestSignature(unittest.TestCase): def setUp(self): TEST_DIR = os.path.dirname(os.path.abspath(__file__)) - self.TESTDATA_DIR = os.path.join(TEST_DIR, 'testdata') + self.TESTDATA_DIR = os.path.join(TEST_DIR, 'testdata/sigs') if not os.path.exists(self.TESTDATA_DIR): os.mkdir(self.TESTDATA_DIR) + def tearDown(self): + try: + for f in os.listdir(self.TESTDATA_DIR): + os.remove(f) + except OSError: + pass + def t_factory(name, sig_func, url_pattern): def make_tfunc(url, sig_input, expected_sig): @@ -145,12 +160,7 @@ def signature(jscode, sig_input): def n_sig(jscode, sig_input): - # Pending implementation of _extract_n_function_name() or similar in - # youtube.py, hard-code here - # funcname = YoutubeIE(FakeYDL())._extract_n_function_name(jscode) - import re - funcname = re.search(r'[=(,&|](\w+)\(\w+\),\w+\.set\("n",', jscode) - funcname = funcname and funcname.group(1) + funcname = YoutubeIE(FakeYDL())._extract_n_function_name(jscode) return JSInterpreter(jscode).call_function(funcname, sig_input) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 63918924d..7943b94f9 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -28,6 +28,7 @@ from ..utils import ( dict_get, float_or_none, int_or_none, + js_to_json, mimetype2ext, parse_codecs, parse_duration, @@ -1391,9 +1392,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # 2. https://code.videolan.org/videolan/vlc/-/blob/4fb284e5af69aa9ac2100ccbdd3b88debec9987f/share/lua/playlist/youtube.lua#L116 # 3. https://github.com/ytdl-org/youtube-dl/issues/30097#issuecomment-950157377 def _extract_n_function_name(self, jscode): - return self._search_regex( - (r'\.get\("n"\)\)&&\(b=(?P[a-zA-Z0-9$]{3})\([a-zA-Z0-9]\)',), - jscode, 'Initial JS player n function name', group='nfunc') + target = r'(?P[a-zA-Z0-9$]{3})(?:\[(?P\d+)\])?' + nfunc_and_idx = self._search_regex( + r'\.get\("n"\)\)&&\(b=(%s)\([a-zA-Z0-9]\)' % (target, ), + jscode, 'Initial JS player n function name') + nfunc, idx = re.match(target, nfunc_and_idx).group('nfunc', 'idx') + if not idx: + return nfunc + return self._parse_json(self._search_regex( + r'var %s\s*=\s*(\[.+?\]);' % (nfunc, ), jscode, + 'Initial JS player n function list ({nfunc}[{idx}])'.format(**locals())), nfunc, transform_source=js_to_json)[int(idx)] def _extract_n_function(self, video_id, player_url): player_id = self._extract_player_info(player_url) From 34c06b16f5eb814308392b68dce07bbff62bc406 Mon Sep 17 00:00:00 2001 From: dirkf Date: Mon, 31 Jan 2022 00:02:56 +0000 Subject: [PATCH 20/70] Support Youtube Shorts URL format --- youtube_dl/extractor/youtube.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 7943b94f9..05688dc70 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -417,6 +417,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): (?:.*?\#/)? # handle anchor (#/) redirect urls (?: # the various things that can precede the ID: (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/ + |shorts/ |(?: # or the v= param in all its forms (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx) (?:\?|\#!?) # the params delimiter ? or # or #! @@ -1119,6 +1120,22 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'skip_download': True, }, }, + { + # YT 'Shorts' + 'url': 'https://youtube.com/shorts/4L2J27mJ3Dc', + 'info_dict': { + 'id': '4L2J27mJ3Dc', + 'ext': 'mp4', + 'upload_date': '20211025', + 'uploader': 'Charlie Berens', + 'description': 'md5:976512b8a29269b93bbd8a61edc45a6d', + 'uploader_id': 'fivedlrmilkshake', + 'title': 'Midwest Squid Game #Shorts', + }, + 'params': { + 'skip_download': True, + }, + }, ] _formats = { '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, From 41f0043983c831b7c0c3614340d2f66ec153087b Mon Sep 17 00:00:00 2001 From: dirkf Date: Tue, 1 Feb 2022 23:22:57 +0000 Subject: [PATCH 21/70] Avoid crashing if n-sig decode fails --- youtube_dl/extractor/youtube.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 05688dc70..4165de15c 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -26,6 +26,7 @@ from ..utils import ( ExtractorError, clean_html, dict_get, + error_to_compat_str, float_or_none, int_or_none, js_to_json, @@ -1463,7 +1464,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor): self._downloader.to_screen('[debug] [%s] %s' % (self.IE_NAME, 'Decrypted nsig {0} => {1}'.format(n_param, self._player_cache[sig_id]))) return self._player_cache[sig_id] except Exception as e: - raise ExtractorError(traceback.format_exc(), cause=e, video_id=video_id) + self._downloader.report_warning( + '[%s] %s (%s %s)' % ( + self.IE_NAME, + 'Unable to decode n-parameter: download likely to be throttled', + error_to_compat_str(e), + traceback.format_exc())) def _unthrottle_format_urls(self, video_id, player_url, formats): for fmt in formats: From 78ce962f4fe020994c216dd2671546fbe58a5c67 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Sun, 30 Jan 2022 01:24:09 +0530 Subject: [PATCH 22/70] [youtube] Support channel search Code from https://github.com/yt-dlp/yt-dlp/commit/cd684175adbe663bbdf6a6c72d8b99b617b6ff2e --- youtube_dl/extractor/youtube.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 4165de15c..8e1254f19 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -2438,6 +2438,17 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): }, { 'url': 'https://www.youtube.com/watch?list=PLW4dVinRY435CBE_JD3t-0SRXKfnZHS1P&feature=youtu.be&v=M9cJMXmQ_ZU', 'only_matching': True, + }, { + 'note': 'Search tab', + 'url': 'https://www.youtube.com/c/3blue1brown/search?query=linear%20algebra', + 'playlist_mincount': 40, + 'info_dict': { + 'id': 'UCYO_jab_esuFRV4b17AJtAw', + 'title': '3Blue1Brown - Search - linear algebra', + 'description': 'md5:e1384e8a133307dd10edee76e875d62f', + 'uploader': '3Blue1Brown', + 'uploader_id': 'UCYO_jab_esuFRV4b17AJtAw', + } }] @classmethod @@ -2835,8 +2846,9 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): @staticmethod def _extract_selected_tab(tabs): for tab in tabs: - if try_get(tab, lambda x: x['tabRenderer']['selected'], bool): - return tab['tabRenderer'] + renderer = dict_get(tab, ('tabRenderer', 'expandableTabRenderer')) or {} + if renderer.get('selected') is True: + return renderer else: raise ExtractorError('Unable to find selected tab') @@ -2893,6 +2905,8 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): title = channel_title or item_id if tab_title: title += ' - %s' % tab_title + if selected_tab.get('expandedText'): + title += ' - %s' % selected_tab['expandedText'] description = renderer.get('description') playlist_id = renderer.get('externalId') else: From 7a497f1405ecdcd76c671c7bfaad238d75d01639 Mon Sep 17 00:00:00 2001 From: dirkf Date: Fri, 4 Feb 2022 04:09:23 +0000 Subject: [PATCH 23/70] Rework 2c2c2bd with an actual Mix page and realistic playlist size From https://github.com/ytdl-org/youtube-dl/commit/2c2c2bd348b7dce0aad55a6fc37a18c6f9a000e3#commitcomment-65953545 --- test/test_youtube_lists.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/test_youtube_lists.py b/test/test_youtube_lists.py index 07a6b6d06..e0e8891ba 100644 --- a/test/test_youtube_lists.py +++ b/test/test_youtube_lists.py @@ -36,12 +36,12 @@ class TestYoutubeLists(unittest.TestCase): dl = FakeYDL() dl.params['format'] = 'best' ie = YoutubeTabIE(dl) - result = dl.extract_info('https://www.youtube.com/watch?v=uVJ0Il5WvbE&list=PLhQjrBD2T381k8ul4WQ8SQ165XqY149WW', + result = dl.extract_info('https://www.youtube.com/watch?v=tyITL_exICo&list=RDCLAK5uy_kLWIr9gv1XLlPbaDS965-Db4TrBoUTxQ8', download=False, ie_key=ie.ie_key(), process=True) entries = (result or {}).get('entries', [{'id': 'not_found', }]) - self.assertTrue(len(entries) >= 50) + self.assertTrue(len(entries) >= 25) original_video = entries[0] - self.assertEqual(original_video['id'], 'uVJ0Il5WvbE') + self.assertEqual(original_video['id'], 'tyITL_exICo') def test_youtube_flat_playlist_extraction(self): dl = FakeYDL() From 0c0876f790c78c38ececbc920073e8b6cf01e9c7 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Thu, 3 Feb 2022 07:44:37 +0530 Subject: [PATCH 24/70] [youtube:search] Add tests --- youtube_dl/extractor/youtube.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 3ab60960a..41695a561 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -3206,7 +3206,14 @@ class YoutubeSearchIE(SearchInfoExtractor, YoutubeBaseInfoExtractor): _SEARCH_KEY = 'ytsearch' _SEARCH_PARAMS = 'EgIQAQ%3D%3D' # Videos only _MAX_RESULTS = float('inf') - _TESTS = [] + _TESTS = [{ + 'url': 'ytsearch10:youtube-dl test video', + 'playlist_count': 10, + 'info_dict': { + 'id': 'youtube-dl test video', + 'title': 'youtube-dl test video', + } + }] def _get_n_results(self, query, n): """Get a specified number of results for a query""" @@ -3219,7 +3226,14 @@ class YoutubeSearchDateIE(YoutubeSearchIE): _SEARCH_KEY = 'ytsearchdate' IE_DESC = 'YouTube.com searches, newest videos first' _SEARCH_PARAMS = 'CAISAhAB' # Videos only, sorted by date - _TESTS = [] + _TESTS = [{ + 'url': 'ytsearchdate10:youtube-dl test video', + 'playlist_count': 10, + 'info_dict': { + 'id': 'youtube-dl test video', + 'title': 'youtube-dl test video', + } + }] class YoutubeSearchURLIE(YoutubeBaseInfoExtractor): @@ -3232,7 +3246,8 @@ class YoutubeSearchURLIE(YoutubeBaseInfoExtractor): 'info_dict': { 'id': 'youtube-dl test video', 'title': 'youtube-dl test video', - } + }, + 'params': {'playlistend': 5} }, { 'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB', 'only_matching': True, From 61d791726f67255c2ed3c0bb6ee24c8c1faeb028 Mon Sep 17 00:00:00 2001 From: dirkf Date: Fri, 4 Feb 2022 11:24:03 +0000 Subject: [PATCH 25/70] Find TV2DK Kaltura ID in Nuxt.js page format --- youtube_dl/extractor/tv2dk.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/tv2dk.py b/youtube_dl/extractor/tv2dk.py index 8bd5fd640..106a081e1 100644 --- a/youtube_dl/extractor/tv2dk.py +++ b/youtube_dl/extractor/tv2dk.py @@ -41,8 +41,16 @@ class TV2DKIE(InfoExtractor): 'duration': 1347, 'view_count': int, }, - 'params': { - 'skip_download': True, + 'add_ie': ['Kaltura'], + }, { + 'url': 'https://www.tv2lorry.dk/gadekamp/gadekamp-6-hoejhuse-i-koebenhavn', + 'info_dict': { + 'id': '1_7iwll9n0', + 'ext': 'mp4', + 'upload_date': '20211027', + 'title': 'Gadekamp #6 - Højhuse i København', + 'uploader_id': 'tv2lorry', + 'timestamp': 1635345229, }, 'add_ie': ['Kaltura'], }, { @@ -91,7 +99,8 @@ class TV2DKIE(InfoExtractor): add_entry(partner_id, kaltura_id) if not entries: kaltura_id = self._search_regex( - r'entry_id\s*:\s*["\']([0-9a-z_]+)', webpage, 'kaltura id') + (r'entry_id\s*:\s*["\']([0-9a-z_]+)', + r'\\u002FentryId\\u002F(\w+)\\u002F'), webpage, 'kaltura id') partner_id = self._search_regex( (r'\\u002Fp\\u002F(\d+)\\u002F', r'/p/(\d+)/'), webpage, 'partner id') From 27dbf6f0ab778a9e3d81be64a615046e6737c3f6 Mon Sep 17 00:00:00 2001 From: dirkf Date: Fri, 4 Feb 2022 11:38:44 +0000 Subject: [PATCH 26/70] Return the item itself if playlist has one entry Removes playlist spam from log --- youtube_dl/extractor/tv2dk.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/tv2dk.py b/youtube_dl/extractor/tv2dk.py index 106a081e1..ec5cbdf03 100644 --- a/youtube_dl/extractor/tv2dk.py +++ b/youtube_dl/extractor/tv2dk.py @@ -105,6 +105,8 @@ class TV2DKIE(InfoExtractor): (r'\\u002Fp\\u002F(\d+)\\u002F', r'/p/(\d+)/'), webpage, 'partner id') add_entry(partner_id, kaltura_id) + if len(entries) == 1: + return entries[0] return self.playlist_result(entries) From 8248133e5ee5579316120cbcbff3ba8b713f1017 Mon Sep 17 00:00:00 2001 From: dirkf Date: Fri, 4 Feb 2022 11:29:41 +0000 Subject: [PATCH 27/70] Back-port yt-dlp Viki extractor From https://github.com/yt-dlp/yt-dlp/pull/2540 --- youtube_dl/extractor/viki.py | 335 +++++++++++++++-------------------- 1 file changed, 144 insertions(+), 191 deletions(-) diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index 2e9cbf148..2ddca0ca6 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -1,38 +1,29 @@ # coding: utf-8 from __future__ import unicode_literals -import base64 import hashlib import hmac -import itertools import json -import re import time from .common import InfoExtractor -from ..compat import ( - compat_parse_qs, - compat_urllib_parse_urlparse, -) from ..utils import ( ExtractorError, int_or_none, parse_age_limit, parse_iso8601, - sanitized_Request, - std_headers, try_get, ) class VikiBaseIE(InfoExtractor): _VALID_URL_BASE = r'https?://(?:www\.)?viki\.(?:com|net|mx|jp|fr)/' - _API_QUERY_TEMPLATE = '/v4/%sapp=%s&t=%s&site=www.viki.com' - _API_URL_TEMPLATE = 'https://api.viki.io%s&sig=%s' + _API_URL_TEMPLATE = 'https://api.viki.io%s' + _DEVICE_ID = '112395910d' _APP = '100005a' - _APP_VERSION = '6.0.0' - _APP_SECRET = 'MM_d*yP@`&1@]@!AVrXf_o-HVEnoTnm$O-ti4[G~$JDI/Dc-&piU&z&5.;:}95=Iad' + _APP_VERSION = '6.11.3' + _APP_SECRET = 'd96704b180208dbb2efa30fe44c48bd8690441af9f567ba8fd710a72badc85198f7472' _GEO_BYPASS = False _NETRC_MACHINE = 'viki' @@ -45,43 +36,60 @@ class VikiBaseIE(InfoExtractor): 'paywall': 'Sorry, this content is only available to Viki Pass Plus subscribers', } - def _prepare_call(self, path, timestamp=None, post_data=None): + def _stream_headers(self, timestamp, sig): + return { + 'X-Viki-manufacturer': 'vivo', + 'X-Viki-device-model': 'vivo 1606', + 'X-Viki-device-os-ver': '6.0.1', + 'X-Viki-connection-type': 'WIFI', + 'X-Viki-carrier': '', + 'X-Viki-as-id': '100005a-1625321982-3932', + 'timestamp': str(timestamp), + 'signature': str(sig), + 'x-viki-app-ver': self._APP_VERSION + } + + def _api_query(self, path, version=4, **kwargs): path += '?' if '?' not in path else '&' - if not timestamp: - timestamp = int(time.time()) - query = self._API_QUERY_TEMPLATE % (path, self._APP, timestamp) + app = self._APP + query = '/v{version}/{path}app={app}'.format(**locals()) if self._token: query += '&token=%s' % self._token + return query + ''.join('&{name}={val}.format(**locals())' for name, val in kwargs.items()) + + def _sign_query(self, path): + timestamp = int(time.time()) + query = self._api_query(path, version=5) sig = hmac.new( self._APP_SECRET.encode('ascii'), - query.encode('ascii'), - hashlib.sha1 - ).hexdigest() - url = self._API_URL_TEMPLATE % (query, sig) - return sanitized_Request( - url, json.dumps(post_data).encode('utf-8')) if post_data else url + '{query}&t={timestamp}'.format(**locals()).encode('ascii'), + hashlib.sha1).hexdigest() + return timestamp, sig, self._API_URL_TEMPLATE % query - def _call_api(self, path, video_id, note, timestamp=None, post_data=None): + def _call_api( + self, path, video_id, note='Downloading JSON metadata', data=None, query=None, fatal=True): + if query is None: + timestamp, sig, url = self._sign_query(path) + else: + url = self._API_URL_TEMPLATE % self._api_query(path, version=4) resp = self._download_json( - self._prepare_call(path, timestamp, post_data), video_id, note, - headers={'x-viki-app-ver': self._APP_VERSION}) - - error = resp.get('error') - if error: - if error == 'invalid timestamp': - resp = self._download_json( - self._prepare_call(path, int(resp['current_timestamp']), post_data), - video_id, '%s (retry)' % note) - error = resp.get('error') - if error: - self._raise_error(resp['error']) + url, video_id, note, fatal=fatal, query=query, + data=json.dumps(data).encode('utf-8') if data else None, + headers=({'x-viki-app-ver': self._APP_VERSION} if data + else self._stream_headers(timestamp, sig) if query is None + else None), expected_status=400) or {} + self._raise_error(resp.get('error'), fatal) return resp - def _raise_error(self, error): - raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, error), - expected=True) + def _raise_error(self, error, fatal=True): + if error is None: + return + msg = '%s said: %s' % (self.IE_NAME, error) + if fatal: + raise ExtractorError(msg, expected=True) + else: + self.report_warning(msg) def _check_errors(self, data): for reason, status in (data.get('blocking') or {}).items(): @@ -90,9 +98,10 @@ class VikiBaseIE(InfoExtractor): if reason == 'geo': self.raise_geo_restricted(msg=message) elif reason == 'paywall': + if try_get(data, lambda x: x['paywallable']['tvod']): + self._raise_error('This video is for rent only or TVOD (Transactional Video On demand)') self.raise_login_required(message) - raise ExtractorError('%s said: %s' % ( - self.IE_NAME, message), expected=True) + self._raise_error(message) def _real_initialize(self): self._login() @@ -102,35 +111,39 @@ class VikiBaseIE(InfoExtractor): if username is None: return - login_form = { - 'login_id': username, - 'password': password, - } - - login = self._call_api( - 'sessions.json', None, - 'Logging in', post_data=login_form) - - self._token = login.get('token') + self._token = self._call_api( + 'sessions.json', None, 'Logging in', fatal=False, + data={'username': username, 'password': password}).get('token') if not self._token: - self.report_warning('Unable to get session token, login has probably failed') + self.report_warning('Login Failed: Unable to get session token') @staticmethod - def dict_selection(dict_obj, preferred_key, allow_fallback=True): + def dict_selection(dict_obj, preferred_key): if preferred_key in dict_obj: - return dict_obj.get(preferred_key) - - if not allow_fallback: - return - - filtered_dict = list(filter(None, [dict_obj.get(k) for k in dict_obj.keys()])) - return filtered_dict[0] if filtered_dict else None + return dict_obj[preferred_key] + return (list(filter(None, dict_obj.values())) or [None])[0] class VikiIE(VikiBaseIE): IE_NAME = 'viki' _VALID_URL = r'%s(?:videos|player)/(?P[0-9]+v)' % VikiBaseIE._VALID_URL_BASE _TESTS = [{ + 'note': 'Free non-DRM video with storyboards in MPD', + 'url': 'https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1', + 'info_dict': { + 'id': '1175236v', + 'ext': 'mp4', + 'title': 'Choosing Spouse by Lottery - Episode 1', + 'timestamp': 1606463239, + 'age_limit': 12, + 'uploader': 'FCC', + 'upload_date': '20201127', + }, + 'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'], + 'params': { + 'format': 'bestvideo', + }, + }, { 'url': 'http://www.viki.com/videos/1023585v-heirs-episode-14', 'info_dict': { 'id': '1023585v', @@ -146,7 +159,7 @@ class VikiIE(VikiBaseIE): 'params': { 'format': 'bestvideo', }, - 'skip': 'Blocked in the US', + 'skip': 'Content is only available to Viki Pass Plus subscribers', 'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'], }, { # clip @@ -178,11 +191,11 @@ class VikiIE(VikiBaseIE): 'like_count': int, 'age_limit': 13, }, - 'skip': 'Blocked in the US', + 'skip': 'Page not found!', }, { # episode 'url': 'http://www.viki.com/videos/44699v-boys-over-flowers-episode-1', - 'md5': '0a53dc252e6e690feccd756861495a8c', + 'md5': '670440c79f7109ca6564d4c7f24e3e81', 'info_dict': { 'id': '44699v', 'ext': 'mp4', @@ -193,7 +206,7 @@ class VikiIE(VikiBaseIE): 'upload_date': '20100405', 'uploader': 'group8', 'like_count': int, - 'age_limit': 13, + 'age_limit': 15, 'episode_number': 1, }, 'params': { @@ -224,7 +237,7 @@ class VikiIE(VikiBaseIE): }, { # non-English description 'url': 'http://www.viki.com/videos/158036v-love-in-magic', - 'md5': '41faaba0de90483fb4848952af7c7d0d', + 'md5': '78bf49fdaa51f9e7f9150262a9ef9bdf', 'info_dict': { 'id': '158036v', 'ext': 'mp4', @@ -232,8 +245,8 @@ class VikiIE(VikiBaseIE): 'upload_date': '20111122', 'timestamp': 1321985454, 'description': 'md5:44b1e46619df3a072294645c770cef36', - 'title': 'Love In Magic', - 'age_limit': 13, + 'title': 'Love in Magic', + 'age_limit': 15, }, 'params': { 'format': 'bestvideo', @@ -244,45 +257,53 @@ class VikiIE(VikiBaseIE): def _real_extract(self, url): video_id = self._match_id(url) - resp = self._download_json( - 'https://www.viki.com/api/videos/' + video_id, - video_id, 'Downloading video JSON', headers={ - 'x-client-user-agent': std_headers['User-Agent'], - 'x-viki-app-ver': '3.0.0', - }) - video = resp['video'] + video = self._call_api('videos/{0}.json'.format(video_id), video_id, 'Downloading video JSON', query={}) self._check_errors(video) - title = self.dict_selection(video.get('titles', {}), 'en', allow_fallback=False) + title = try_get(video, lambda x: x['titles']['en'], str) episode_number = int_or_none(video.get('number')) if not title: title = 'Episode %d' % episode_number if video.get('type') == 'episode' else video.get('id') or video_id container_titles = try_get(video, lambda x: x['container']['titles'], dict) or {} container_title = self.dict_selection(container_titles, 'en') - title = '%s - %s' % (container_title, title) + if container_title and title == video_id: + title = container_title + else: + title = '%s - %s' % (container_title, title) + + resp = self._call_api( + 'playback_streams/%s.json?drms=dt3&device_id=%s' % (video_id, self._DEVICE_ID), + video_id, 'Downloading video streams JSON')['main'][0] + + mpd_url = resp['url'] + # 720p is hidden in another MPD which can be found in the current manifest content + mpd_content = self._download_webpage(mpd_url, video_id, note='Downloading initial MPD manifest') + mpd_url = self._search_regex( + r'(?mi)(http.+.mpd)', mpd_content, 'new manifest', default=mpd_url) + if 'mpdhd_high' not in mpd_url: + # Modify the URL to get 1080p + mpd_url = mpd_url.replace('mpdhd', 'mpdhd_high') + formats = self._extract_mpd_formats(mpd_url, video_id) + self._sort_formats(formats) description = self.dict_selection(video.get('descriptions', {}), 'en') - + thumbnails = [{ + 'id': thumbnail_id, + 'url': thumbnail['url'], + } for thumbnail_id, thumbnail in (video.get('images') or {}).items() if thumbnail.get('url')] like_count = int_or_none(try_get(video, lambda x: x['likes']['count'])) - thumbnails = [] - for thumbnail_id, thumbnail in (video.get('images') or {}).items(): - thumbnails.append({ - 'id': thumbnail_id, - 'url': thumbnail.get('url'), - }) + stream_id = try_get(resp, lambda x: x['properties']['track']['stream_id']) + subtitles = dict((lang, [{ + 'ext': ext, + 'url': self._API_URL_TEMPLATE % self._api_query( + 'videos/{0}/auth_subtitles/{1}.{2}'.format(video_id, lang, ext), stream_id=stream_id) + } for ext in ('srt', 'vtt')]) for lang in (video.get('subtitle_completions') or {}).keys()) - subtitles = {} - for subtitle_lang, _ in (video.get('subtitle_completions') or {}).items(): - subtitles[subtitle_lang] = [{ - 'ext': subtitles_format, - 'url': self._prepare_call( - 'videos/%s/subtitles/%s.%s' % (video_id, subtitle_lang, subtitles_format)), - } for subtitles_format in ('srt', 'vtt')] - - result = { + return { 'id': video_id, + 'formats': formats, 'title': title, 'description': description, 'duration': int_or_none(video.get('duration')), @@ -296,79 +317,6 @@ class VikiIE(VikiBaseIE): 'episode_number': episode_number, } - formats = [] - - def add_format(format_id, format_dict, protocol='http'): - # rtmps URLs does not seem to work - if protocol == 'rtmps': - return - format_url = format_dict.get('url') - if not format_url: - return - qs = compat_parse_qs(compat_urllib_parse_urlparse(format_url).query) - stream = qs.get('stream', [None])[0] - if stream: - format_url = base64.b64decode(stream).decode() - if format_id in ('m3u8', 'hls'): - m3u8_formats = self._extract_m3u8_formats( - format_url, video_id, 'mp4', - entry_protocol='m3u8_native', - m3u8_id='m3u8-%s' % protocol, fatal=False) - # Despite CODECS metadata in m3u8 all video-only formats - # are actually video+audio - for f in m3u8_formats: - if '_drm/index_' in f['url']: - continue - if f.get('acodec') == 'none' and f.get('vcodec') != 'none': - f['acodec'] = None - formats.append(f) - elif format_id in ('mpd', 'dash'): - formats.extend(self._extract_mpd_formats( - format_url, video_id, 'mpd-%s' % protocol, fatal=False)) - elif format_url.startswith('rtmp'): - mobj = re.search( - r'^(?Prtmp://[^/]+/(?P.+?))/(?Pmp4:.+)$', - format_url) - if not mobj: - return - formats.append({ - 'format_id': 'rtmp-%s' % format_id, - 'ext': 'flv', - 'url': mobj.group('url'), - 'play_path': mobj.group('playpath'), - 'app': mobj.group('app'), - 'page_url': url, - }) - else: - formats.append({ - 'url': format_url, - 'format_id': '%s-%s' % (format_id, protocol), - 'height': int_or_none(self._search_regex( - r'^(\d+)[pP]$', format_id, 'height', default=None)), - }) - - for format_id, format_dict in (resp.get('streams') or {}).items(): - add_format(format_id, format_dict) - if not formats: - streams = self._call_api( - 'videos/%s/streams.json' % video_id, video_id, - 'Downloading video streams JSON') - - if 'external' in streams: - result.update({ - '_type': 'url_transparent', - 'url': streams['external']['url'], - }) - return result - - for format_id, stream_dict in streams.items(): - for protocol, format_dict in stream_dict.items(): - add_format(format_id, format_dict, protocol) - self._sort_formats(formats) - - result['formats'] = formats - return result - class VikiChannelIE(VikiBaseIE): IE_NAME = 'viki:channel' @@ -378,9 +326,9 @@ class VikiChannelIE(VikiBaseIE): 'info_dict': { 'id': '50c', 'title': 'Boys Over Flowers', - 'description': 'md5:804ce6e7837e1fd527ad2f25420f4d59', + 'description': 'md5:f08b679c200e1a273c695fe9986f21d7', }, - 'playlist_mincount': 71, + 'playlist_mincount': 51, }, { 'url': 'http://www.viki.com/tv/1354c-poor-nastya-complete', 'info_dict': { @@ -401,33 +349,38 @@ class VikiChannelIE(VikiBaseIE): 'only_matching': True, }] - _PER_PAGE = 25 + _video_types = ('episodes', 'movies', 'clips', 'trailers') + + def _entries(self, channel_id): + params = { + 'app': self._APP, 'token': self._token, 'only_ids': 'true', + 'direction': 'asc', 'sort': 'number', 'per_page': 30 + } + video_types = self._video_types + for video_type in video_types: + if video_type not in self._video_types: + self.report_warning('Unknown video_type: ' + video_type) + page_num = 0 + while True: + page_num += 1 + params['page'] = page_num + res = self._call_api( + 'containers/{channel_id}/{video_type}.json'.format(**locals()), channel_id, query=params, fatal=False, + note='Downloading %s JSON page %d' % (video_type.title(), page_num)) + + for video_id in res.get('response') or []: + yield self.url_result('https://www.viki.com/videos/' + video_id, VikiIE.ie_key(), video_id) + if not res.get('more'): + break def _real_extract(self, url): channel_id = self._match_id(url) - channel = self._call_api( - 'containers/%s.json' % channel_id, channel_id, - 'Downloading channel JSON') + channel = self._call_api('containers/%s.json' % channel_id, channel_id, 'Downloading channel JSON') self._check_errors(channel) - title = self.dict_selection(channel['titles'], 'en') - - description = self.dict_selection(channel['descriptions'], 'en') - - entries = [] - for video_type in ('episodes', 'clips', 'movies'): - for page_num in itertools.count(1): - page = self._call_api( - 'containers/%s/%s.json?per_page=%d&sort=number&direction=asc&with_paging=true&page=%d' - % (channel_id, video_type, self._PER_PAGE, page_num), channel_id, - 'Downloading %s JSON page #%d' % (video_type, page_num)) - for video in page['response']: - video_id = video['id'] - entries.append(self.url_result( - 'https://www.viki.com/videos/%s' % video_id, 'Viki')) - if not page['pagination']['next']: - break - - return self.playlist_result(entries, channel_id, title, description) + return self.playlist_result( + self._entries(channel_id), channel_id, + self.dict_selection(channel['titles'], 'en'), + self.dict_selection(channel['descriptions'], 'en')) From b494824286f0ac2fc7313452b287fbbffe61ccbe Mon Sep 17 00:00:00 2001 From: dirkf Date: Mon, 17 Jan 2022 13:11:11 +0000 Subject: [PATCH 28/70] Support Tele5 pages with Discovery Networks format instead of JWPlatform --- youtube_dl/extractor/tele5.py | 86 ++++++++++++++--------------------- 1 file changed, 35 insertions(+), 51 deletions(-) diff --git a/youtube_dl/extractor/tele5.py b/youtube_dl/extractor/tele5.py index 3e1a7a9e6..df02dfc47 100644 --- a/youtube_dl/extractor/tele5.py +++ b/youtube_dl/extractor/tele5.py @@ -1,19 +1,16 @@ # coding: utf-8 from __future__ import unicode_literals -import re - -from .common import InfoExtractor -from .jwplatform import JWPlatformIE -from .nexx import NexxIE from ..compat import compat_urlparse from ..utils import ( - NO_DEFAULT, - smuggle_url, + ExtractorError, + extract_attributes, ) +from .dplay import DPlayIE -class Tele5IE(InfoExtractor): + +class Tele5IE(DPlayIE): _VALID_URL = r'https?://(?:www\.)?tele5\.de/(?:[^/]+/)*(?P[^/?#&]+)' _GEO_COUNTRIES = ['DE'] _TESTS = [{ @@ -28,6 +25,7 @@ class Tele5IE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'No longer available: "404 Seite nicht gefunden"', }, { # jwplatform, nexx unavailable 'url': 'https://www.tele5.de/filme/ghoul-das-geheimnis-des-friedhofmonsters/', @@ -42,7 +40,20 @@ class Tele5IE(InfoExtractor): 'params': { 'skip_download': True, }, - 'add_ie': [JWPlatformIE.ie_key()], + 'skip': 'No longer available, redirects to Filme page', + }, { + 'url': 'https://tele5.de/mediathek/angel-of-mine/', + 'info_dict': { + 'id': '1252360', + 'ext': 'mp4', + 'upload_date': '20220109', + 'timestamp': 1641762000, + 'title': 'Angel of Mine', + 'description': 'md5:a72546a175e1286eb3251843a52d1ad7', + }, + 'params': { + 'format': 'bestvideo', + }, }, { 'url': 'https://www.tele5.de/kalkofes-mattscheibe/video-clips/politik-und-gesellschaft?ve_id=1551191', 'only_matching': True, @@ -64,45 +75,18 @@ class Tele5IE(InfoExtractor): }] def _real_extract(self, url): - qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) - video_id = (qs.get('vid') or qs.get('ve_id') or [None])[0] - - NEXX_ID_RE = r'\d{6,}' - JWPLATFORM_ID_RE = r'[a-zA-Z0-9]{8}' - - def nexx_result(nexx_id): - return self.url_result( - 'https://api.nexx.cloud/v3/759/videos/byid/%s' % nexx_id, - ie=NexxIE.ie_key(), video_id=nexx_id) - - nexx_id = jwplatform_id = None - - if video_id: - if re.match(NEXX_ID_RE, video_id): - return nexx_result(video_id) - elif re.match(JWPLATFORM_ID_RE, video_id): - jwplatform_id = video_id - - if not nexx_id: - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - - def extract_id(pattern, name, default=NO_DEFAULT): - return self._html_search_regex( - (r'id\s*=\s*["\']video-player["\'][^>]+data-id\s*=\s*["\'](%s)' % pattern, - r'\s+id\s*=\s*["\']player_(%s)' % pattern, - r'\bdata-id\s*=\s*["\'](%s)' % pattern), webpage, name, - default=default) - - nexx_id = extract_id(NEXX_ID_RE, 'nexx id', default=None) - if nexx_id: - return nexx_result(nexx_id) - - if not jwplatform_id: - jwplatform_id = extract_id(JWPLATFORM_ID_RE, 'jwplatform id') - - return self.url_result( - smuggle_url( - 'jwplatform:%s' % jwplatform_id, - {'geo_countries': self._GEO_COUNTRIES}), - ie=JWPlatformIE.ie_key(), video_id=jwplatform_id) + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + player_element = self._search_regex(r'(]+?>)', webpage, 'video player') + player_info = extract_attributes(player_element) + asset_id, country, realm = (player_info[x] for x in ('assetid', 'locale', 'realm', )) + endpoint = compat_urlparse.urlparse(player_info['endpoint']).hostname + source_type = player_info.get('sourcetype') + if source_type: + endpoint = '%s-%s' % (source_type, endpoint) + try: + return self._get_disco_api_info(url, asset_id, endpoint, realm, country) + except ExtractorError as e: + if getattr(e, 'message', '') == 'Missing deviceId in context': + raise ExtractorError('DRM protected', cause=e, expected=True) + raise From 4186e817772d49d6f66b07c5ac8c248f026a6446 Mon Sep 17 00:00:00 2001 From: dirkf Date: Mon, 17 Jan 2022 03:13:37 +0000 Subject: [PATCH 29/70] NDR: improve extraction of NDR id, description, etc with current page formats --- youtube_dl/extractor/ndr.py | 45 +++++++++++++++++++++++++++---------- 1 file changed, 33 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py index ddd828d92..a0d553f00 100644 --- a/youtube_dl/extractor/ndr.py +++ b/youtube_dl/extractor/ndr.py @@ -4,8 +4,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_urllib_parse_urlparse from ..utils import ( determine_ext, + ExtractorError, int_or_none, merge_dicts, parse_iso8601, @@ -20,13 +22,13 @@ class NDRBaseIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) display_id = next(group for group in mobj.groups() if group) webpage = self._download_webpage(url, display_id) - return self._extract_embed(webpage, display_id) + return self._extract_embed(webpage, display_id, url) class NDRIE(NDRBaseIE): IE_NAME = 'ndr' IE_DESC = 'NDR.de - Norddeutscher Rundfunk' - _VALID_URL = r'https?://(?:www\.)?ndr\.de/(?:[^/]+/)*(?P[^/?#]+),[\da-z]+\.html' + _VALID_URL = r'https?://(?:\w+\.)?ndr\.de/(?:[^/]+/)*(?P[^/?#]+),[\da-z]+\.html' _TESTS = [{ # httpVideo, same content id 'url': 'http://www.ndr.de/fernsehen/Party-Poette-und-Parade,hafengeburtstag988.html', @@ -109,19 +111,38 @@ class NDRIE(NDRBaseIE): 'only_matching': True, }] - def _extract_embed(self, webpage, display_id): - embed_url = self._html_search_meta( - 'embedURL', webpage, 'embed URL', - default=None) or self._search_regex( - r'\bembedUrl["\']\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, - 'embed URL', group='url') + def _extract_embed(self, webpage, display_id, url): + embed_url = ( + self._html_search_meta( + 'embedURL', webpage, 'embed URL', + default=None) + or self._search_regex( + r'\bembedUrl["\']\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, + 'embed URL', group='url', default=None) + or self._search_regex( + r'\bvar\s*sophoraID\s*=\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, + 'embed URL', group='url', default='')) + # some more work needed if we only found sophoraID + if re.match(r'^[a-z]+\d+$', embed_url): + # get the initial part of the url path,. eg /panorama/archiv/2022/ + parsed_url = compat_urllib_parse_urlparse(url) + path = self._search_regex(r'(.+/)%s' % display_id, parsed_url.path or '', 'embed URL', default='') + # find tell-tale image with the actual ID + ndr_id = self._search_regex(r'%s([a-z]+\d+)(?!\.)\b' % (path, ), webpage, 'embed URL', default=None) + # or try to use special knowledge! + NDR_INFO_URL_TPL = 'https://www.ndr.de/info/%s-player.html' + embed_url = 'ndr:%s' % (ndr_id, ) if ndr_id else NDR_INFO_URL_TPL % (embed_url, ) + if not embed_url: + raise ExtractorError('Unable to extract embedUrl') + description = self._search_regex( r']+itemprop="description">([^<]+)

', webpage, 'description', default=None) or self._og_search_description(webpage) timestamp = parse_iso8601( self._search_regex( - r']+itemprop="(?:datePublished|uploadDate)"[^>]+content="([^"]+)"', - webpage, 'upload date', default=None)) + (r']+itemprop="(?:datePublished|uploadDate)"[^>]+content="(?P[^"]+)"', + r'\bvar\s*pdt\s*=\s*(?P["\'])(?P(?:(?!(?P=q)).)+)(?P=q)', ), + webpage, 'upload date', group='cont', default=None)) info = self._search_json_ld(webpage, display_id, default={}) return merge_dicts({ '_type': 'url_transparent', @@ -179,7 +200,7 @@ class NJoyIE(NDRBaseIE): video_id = self._search_regex( r']+id="pp_([\da-z]+)"', webpage, 'embed id') description = self._search_regex( - r']+class="subline"[^>]*>[^<]+\s*

([^<]+)

', + r']+class="subline"[^>]*>[^<]+\s*

([^<]+)

', webpage, 'description', fatal=False) return { '_type': 'url_transparent', @@ -291,7 +312,7 @@ class NDREmbedBaseIE(InfoExtractor): class NDREmbedIE(NDREmbedBaseIE): IE_NAME = 'ndr:embed' - _VALID_URL = r'https?://(?:www\.)?ndr\.de/(?:[^/]+/)*(?P[\da-z]+)-(?:player|externalPlayer)\.html' + _VALID_URL = r'https?://(?:www\.)?ndr\.de/(?:[^/]+/)*(?P[\da-z]+)-(?:(?:ard)?player|externalPlayer)\.html' _TESTS = [{ 'url': 'http://www.ndr.de/fernsehen/sendungen/ndr_aktuell/ndraktuell28488-player.html', 'md5': '8b9306142fe65bbdefb5ce24edb6b0a9', From f0a05a55c2ee512880546c056cfbec5ad3399798 Mon Sep 17 00:00:00 2001 From: dirkf Date: Mon, 17 Jan 2022 03:22:32 +0000 Subject: [PATCH 30/70] NJoy: improve extraction of NDR id, description, etc with current page formats --- youtube_dl/extractor/ndr.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py index a0d553f00..0a723e3b0 100644 --- a/youtube_dl/extractor/ndr.py +++ b/youtube_dl/extractor/ndr.py @@ -196,18 +196,25 @@ class NJoyIE(NDRBaseIE): 'only_matching': True, }] - def _extract_embed(self, webpage, display_id): + def _extract_embed(self, webpage, display_id, url=None): + # find tell-tale URL with the actual ID, or ... video_id = self._search_regex( - r']+id="pp_([\da-z]+)"', webpage, 'embed id') - description = self._search_regex( + (r'''\bsrc\s*=\s*(?:"|')?(?:/\w+)+/([a-z]+\d+)(?!\.)\b''', + r']+id="pp_([\da-z]+)"', ), + webpage, 'NDR id', default=None) + + description = ( + self._html_search_meta('description', webpage) + or self._search_regex( r']+class="subline"[^>]*>[^<]+\s*

([^<]+)

', - webpage, 'description', fatal=False) + webpage, 'description', fatal=False)) return { '_type': 'url_transparent', 'ie_key': 'NDREmbedBase', 'url': 'ndr:%s' % video_id, 'display_id': display_id, 'description': description, + 'title': display_id.replace('-', ' ').strip(), } From 39a98b09a2acf50dc64bc41185be723b98e740b9 Mon Sep 17 00:00:00 2001 From: dirkf Date: Mon, 17 Jan 2022 03:29:43 +0000 Subject: [PATCH 31/70] Fix NDR, NJoy tests --- youtube_dl/extractor/ndr.py | 41 ++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py index 0a723e3b0..1996d4f96 100644 --- a/youtube_dl/extractor/ndr.py +++ b/youtube_dl/extractor/ndr.py @@ -40,13 +40,14 @@ class NDRIE(NDRBaseIE): 'title': 'Party, Pötte und Parade', 'description': 'md5:ad14f9d2f91d3040b6930c697e5f6b4c', 'uploader': 'ndrtv', - 'timestamp': 1431108900, + 'timestamp': 1431255671, 'upload_date': '20150510', 'duration': 3498, }, 'params': { 'skip_download': True, }, + 'expected_warnings': ['Unable to download f4m manifest'], }, { # httpVideo, different content id 'url': 'http://www.ndr.de/sport/fussball/40-Osnabrueck-spielt-sich-in-einen-Rausch,osna270.html', @@ -65,6 +66,7 @@ class NDRIE(NDRBaseIE): 'params': { 'skip_download': True, }, + 'skip': 'No longer available', }, { # httpAudio, same content id 'url': 'http://www.ndr.de/info/La-Valette-entgeht-der-Hinrichtung,audio51535.html', @@ -76,8 +78,8 @@ class NDRIE(NDRBaseIE): 'title': 'La Valette entgeht der Hinrichtung', 'description': 'md5:22f9541913a40fe50091d5cdd7c9f536', 'uploader': 'ndrinfo', - 'timestamp': 1290626100, - 'upload_date': '20140729', + 'timestamp': 1631711863, + 'upload_date': '20210915', 'duration': 884, }, 'params': { @@ -91,9 +93,10 @@ class NDRIE(NDRBaseIE): 'display_id': 'extra-3-Satiremagazin-mit-Christian-Ehring', 'ext': 'mp4', 'title': 'Extra 3 vom 11.11.2020 mit Christian Ehring', - 'description': 'md5:42ee53990a715eaaf4dc7f13a3bd56c6', + 'description': 'md5:700f6de264010585012a72f97b0ac0c9', 'uploader': 'ndrtv', - 'upload_date': '20201113', + 'upload_date': '20201207', + 'timestamp': 1614349457, 'duration': 1749, 'subtitles': { 'de': [{ @@ -174,19 +177,19 @@ class NJoyIE(NDRBaseIE): 'params': { 'skip_download': True, }, + 'skip': 'No longer available', }, { # httpVideo, different content id 'url': 'http://www.n-joy.de/musik/Das-frueheste-DJ-Set-des-Nordens-live-mit-Felix-Jaehn-,felixjaehn168.html', 'md5': '417660fffa90e6df2fda19f1b40a64d8', 'info_dict': { - 'id': 'dockville882', + 'id': 'livestream283', 'display_id': 'Das-frueheste-DJ-Set-des-Nordens-live-mit-Felix-Jaehn-', - 'ext': 'mp4', - 'title': '"Ich hab noch nie" mit Felix Jaehn', - 'description': 'md5:85dd312d53be1b99e1f998a16452a2f3', + 'ext': 'mp3', + 'title': 'Das frueheste DJ Set des Nordens live mit Felix Jaehn', + 'description': 'md5:681698f527b8601e511e7b79edde7d2c', 'uploader': 'njoy', - 'upload_date': '20150822', - 'duration': 211, + 'upload_date': '20210830', }, 'params': { 'skip_download': True, @@ -332,6 +335,7 @@ class NDREmbedIE(NDREmbedBaseIE): 'upload_date': '20150907', 'duration': 132, }, + 'skip': 'No longer available', }, { 'url': 'http://www.ndr.de/ndr2/events/soundcheck/soundcheck3366-player.html', 'md5': '002085c44bae38802d94ae5802a36e78', @@ -347,6 +351,7 @@ class NDREmbedIE(NDREmbedBaseIE): 'params': { 'skip_download': True, }, + 'skip': 'No longer available', }, { 'url': 'http://www.ndr.de/info/audio51535-player.html', 'md5': 'bb3cd38e24fbcc866d13b50ca59307b8', @@ -356,7 +361,7 @@ class NDREmbedIE(NDREmbedBaseIE): 'title': 'La Valette entgeht der Hinrichtung', 'is_live': False, 'uploader': 'ndrinfo', - 'upload_date': '20140729', + 'upload_date': '20210915', 'duration': 884, }, 'params': { @@ -377,15 +382,17 @@ class NDREmbedIE(NDREmbedBaseIE): 'params': { 'skip_download': True, }, + 'skip': 'No longer available', }, { # httpVideoLive 'url': 'http://www.ndr.de/fernsehen/livestream/livestream217-externalPlayer.html', 'info_dict': { 'id': 'livestream217', - 'ext': 'flv', + 'ext': 'mp4', 'title': r're:^NDR Fernsehen Niedersachsen \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', 'is_live': True, - 'upload_date': '20150910', + 'upload_date': '20210409', + 'uploader': 'ndrtv', }, 'params': { 'skip_download': True, @@ -423,9 +430,10 @@ class NJoyEmbedIE(NDREmbedBaseIE): 'ext': 'mp4', 'title': 'Zehn Jahre Reeperbahn Festival - die Doku', 'is_live': False, - 'upload_date': '20150807', + 'upload_date': '20200826', 'duration': 1011, }, + 'expected_warnings': ['Unable to download f4m manifest'], }, { # httpAudio 'url': 'http://www.n-joy.de/news_wissen/stefanrichter100-player_image-d5e938b1-f21a-4b9a-86b8-aaba8bca3a13_theme-n-joy.html', @@ -442,6 +450,7 @@ class NJoyEmbedIE(NDREmbedBaseIE): 'params': { 'skip_download': True, }, + 'skip': 'No longer available', }, { # httpAudioLive, no explicit ext 'url': 'http://www.n-joy.de/news_wissen/webradioweltweit100-player_image-3fec0484-2244-4565-8fb8-ed25fd28b173_theme-n-joy.html', @@ -451,7 +460,7 @@ class NJoyEmbedIE(NDREmbedBaseIE): 'title': r're:^N-JOY Weltweit \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', 'is_live': True, 'uploader': 'njoy', - 'upload_date': '20150810', + 'upload_date': '20210830', }, 'params': { 'skip_download': True, From 01824d275bfa7efbaca274b38c1ddc2b03f12f5d Mon Sep 17 00:00:00 2001 From: dirkf Date: Wed, 19 Jan 2022 13:24:33 +0000 Subject: [PATCH 32/70] Additional tweaks: allow any .ndr.de, simplify quote match --- youtube_dl/extractor/ndr.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py index 1996d4f96..26627f8b0 100644 --- a/youtube_dl/extractor/ndr.py +++ b/youtube_dl/extractor/ndr.py @@ -28,7 +28,7 @@ class NDRBaseIE(InfoExtractor): class NDRIE(NDRBaseIE): IE_NAME = 'ndr' IE_DESC = 'NDR.de - Norddeutscher Rundfunk' - _VALID_URL = r'https?://(?:\w+\.)?ndr\.de/(?:[^/]+/)*(?P[^/?#]+),[\da-z]+\.html' + _VALID_URL = r'https?://(?:\w+\.)*ndr\.de/(?:[^/]+/)*(?P[^/?#]+),[\da-z]+\.html' _TESTS = [{ # httpVideo, same content id 'url': 'http://www.ndr.de/fernsehen/Party-Poette-und-Parade,hafengeburtstag988.html', @@ -202,7 +202,7 @@ class NJoyIE(NDRBaseIE): def _extract_embed(self, webpage, display_id, url=None): # find tell-tale URL with the actual ID, or ... video_id = self._search_regex( - (r'''\bsrc\s*=\s*(?:"|')?(?:/\w+)+/([a-z]+\d+)(?!\.)\b''', + (r'''\bsrc\s*=\s*["']?(?:/\w+)+/([a-z]+\d+)(?!\.)\b''', r']+id="pp_([\da-z]+)"', ), webpage, 'NDR id', default=None) @@ -322,7 +322,7 @@ class NDREmbedBaseIE(InfoExtractor): class NDREmbedIE(NDREmbedBaseIE): IE_NAME = 'ndr:embed' - _VALID_URL = r'https?://(?:www\.)?ndr\.de/(?:[^/]+/)*(?P[\da-z]+)-(?:(?:ard)?player|externalPlayer)\.html' + _VALID_URL = r'https?://(?:\w+\.)*ndr\.de/(?:[^/]+/)*(?P[\da-z]+)-(?:(?:ard)?player|externalPlayer)\.html' _TESTS = [{ 'url': 'http://www.ndr.de/fernsehen/sendungen/ndr_aktuell/ndraktuell28488-player.html', 'md5': '8b9306142fe65bbdefb5ce24edb6b0a9', From 5197336de6ee2d18c37732f3f7c6532c8899ec29 Mon Sep 17 00:00:00 2001 From: dirkf Date: Fri, 14 Jan 2022 20:14:14 +0000 Subject: [PATCH 33/70] Support more deeply nested ptmd_path with test, update tests --- youtube_dl/extractor/zdf.py | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index 4dd56f66d..3d39bb33a 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -7,6 +7,7 @@ from .common import InfoExtractor from ..compat import compat_str from ..utils import ( determine_ext, + ExtractorError, float_or_none, int_or_none, merge_dicts, @@ -145,6 +146,7 @@ class ZDFIE(ZDFBaseIE): 'timestamp': 1613948400, 'upload_date': '20210221', }, + 'skip': 'No longer available: "Diese Seite wurde leider nicht gefunden"', }, { # Same as https://www.3sat.de/film/ab-18/10-wochen-sommer-108.html 'url': 'https://www.zdf.de/dokumentation/ab-18/10-wochen-sommer-102.html', @@ -158,6 +160,7 @@ class ZDFIE(ZDFBaseIE): 'timestamp': 1608604200, 'upload_date': '20201222', }, + 'skip': 'No longer available: "Diese Seite wurde leider nicht gefunden"', }, { 'url': 'https://www.zdf.de/dokumentation/terra-x/die-magie-der-farben-von-koenigspurpur-und-jeansblau-100.html', 'info_dict': { @@ -190,6 +193,17 @@ class ZDFIE(ZDFBaseIE): }, { 'url': 'https://www.zdf.de/dokumentation/planet-e/planet-e-uebersichtsseite-weitere-dokumentationen-von-planet-e-100.html', 'only_matching': True, + }, { + 'url': 'https://www.zdf.de/arte/todliche-flucht/page-video-artede-toedliche-flucht-16-100.html', + 'info_dict': { + 'id': 'video_artede_083871-001-A', + 'ext': 'mp4', + 'title': 'Tödliche Flucht (1/6)', + 'description': 'md5:e34f96a9a5f8abd839ccfcebad3d5315', + 'duration': 3193.0, + 'timestamp': 1641355200, + 'upload_date': '20220105', + }, }] def _extract_entry(self, url, player, content, video_id): @@ -197,12 +211,18 @@ class ZDFIE(ZDFBaseIE): t = content['mainVideoContent']['http://zdf.de/rels/target'] - ptmd_path = t.get('http://zdf.de/rels/streams/ptmd') + def get_ptmd_path(d): + return ( + d.get('http://zdf.de/rels/streams/ptmd') + or d.get('http://zdf.de/rels/streams/ptmd-template', + '').replace('{playerId}', 'ngplayer_2_4')) + + ptmd_path = get_ptmd_path(try_get(t, lambda x: x['streams']['default'], dict) or {}) + if not ptmd_path: + ptmd_path = get_ptmd_path(t) if not ptmd_path: - ptmd_path = t[ - 'http://zdf.de/rels/streams/ptmd-template'].replace( - '{playerId}', 'ngplayer_2_4') + raise ExtractorError('Could not extract ptmd_path') info = self._extract_ptmd( urljoin(url, ptmd_path), video_id, player['apiToken'], url) From 5cb4833f408745135d1b0e178b9a2545a899f2ac Mon Sep 17 00:00:00 2001 From: dirkf Date: Thu, 13 Jan 2022 19:38:08 +0000 Subject: [PATCH 34/70] Update URPlayIE extractor for Next.js page format, with subtitles --- youtube_dl/extractor/urplay.py | 52 ++++++++++++++++++++++++++++++---- 1 file changed, 46 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/urplay.py b/youtube_dl/extractor/urplay.py index d6c79147e..abd2bee84 100644 --- a/youtube_dl/extractor/urplay.py +++ b/youtube_dl/extractor/urplay.py @@ -4,7 +4,11 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( dict_get, + ExtractorError, int_or_none, + ISO639Utils, + parse_age_limit, + try_get, unified_timestamp, ) @@ -23,9 +27,10 @@ class URPlayIE(InfoExtractor): 'upload_date': '20171214', 'series': 'UR Samtiden - Livet, universum och rymdens märkliga musik', 'duration': 2269, - 'categories': ['Kultur & historia'], + 'categories': ['Vetenskap & teknik'], 'tags': ['Kritiskt tänkande', 'Vetenskap', 'Vetenskaplig verksamhet'], 'episode': 'Om vetenskap, kritiskt tänkande och motstånd', + 'age_limit': 15, }, }, { 'url': 'https://urskola.se/Produkter/190031-Tripp-Trapp-Trad-Sovkudde', @@ -50,11 +55,19 @@ class URPlayIE(InfoExtractor): video_id = self._match_id(url) url = url.replace('skola.se/Produkter', 'play.se/program') webpage = self._download_webpage(url, video_id) - vid = int(video_id) - accessible_episodes = self._parse_json(self._html_search_regex( - r'data-react-class="routes/Product/components/ProgramContainer/ProgramContainer"[^>]+data-react-props="({.+?})"', - webpage, 'urplayer data'), video_id)['accessibleEpisodes'] - urplayer_data = next(e for e in accessible_episodes if e.get('id') == vid) + urplayer_data = self._search_regex( + r'(?s)\bid\s*=\s*"__NEXT_DATA__"[^>]*>\s*({.+?})\s*]+data-react-props="({.+?})"', + webpage, 'urplayer data'), video_id)['accessibleEpisodes'] + urplayer_data = next(e for e in accessible_episodes if e.get('id') == int_or_none(video_id)) episode = urplayer_data['title'] raw_streaming_info = urplayer_data['streamingInfo']['raw'] host = self._download_json( @@ -72,6 +85,30 @@ class URPlayIE(InfoExtractor): video_id, skip_protocols=['f4m', 'rtmp', 'rtsp'])) self._sort_formats(formats) + subtitles = {} + + def parse_lang_code(code): + "3-character language code or None (utils candidate)" + if code is None: + return + lang = code.lower() + if not ISO639Utils.long2short(lang): + lang = ISO639Utils.short2long(lang) + return lang or None + + for k, v in (urplayer_data['streamingInfo'].get('sweComplete') or {}).items(): + if (k in ('sd', 'hd') or not isinstance(v, dict)): + continue + lang, sttl_url = (v.get(kk) for kk in ('language', 'location', )) + if not sttl_url: + continue + lang = parse_lang_code(lang) + if not lang: + continue + sttl = subtitles.get(lang) or [] + sttl.append({'ext': k, 'url': sttl_url, }) + subtitles[lang] = sttl + image = urplayer_data.get('image') or {} thumbnails = [] for k, v in image.items(): @@ -104,4 +141,7 @@ class URPlayIE(InfoExtractor): 'season': series.get('label'), 'episode': episode, 'episode_number': int_or_none(urplayer_data.get('episodeNumber')), + 'age_limit': parse_age_limit(min(try_get(a, lambda x: x['from'], int) or 0 + for a in urplayer_data.get('ageRanges', []))), + 'subtitles': subtitles, } From 568c7005d513d0398c20b9e88eb9838c68651fc2 Mon Sep 17 00:00:00 2001 From: dirkf Date: Tue, 25 Jan 2022 12:59:31 +0000 Subject: [PATCH 35/70] Fix WDRMaus; extend URL matching for other Maus pages; improve ID extraction --- youtube_dl/extractor/wdr.py | 39 +++++++++++++++++++++++++++---------- 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py index 2903d189e..a5488f3fd 100644 --- a/youtube_dl/extractor/wdr.py +++ b/youtube_dl/extractor/wdr.py @@ -10,6 +10,7 @@ from ..compat import ( ) from ..utils import ( determine_ext, + dict_get, ExtractorError, js_to_json, strip_jsonp, @@ -22,9 +23,10 @@ from ..utils import ( class WDRIE(InfoExtractor): - _VALID_URL = r'https?://deviceids-medp\.wdr\.de/ondemand/\d+/(?P\d+)\.js' + __API_URL_TPL = '//deviceids-medp.wdr.de/ondemand/%s/%s' + _VALID_URL = (r'(?:https?:' + __API_URL_TPL) % (r'\d+', r'(?=\d+\.js)|wdr:)(?P\d{6,})') _GEO_COUNTRIES = ['DE'] - _TEST = { + _TESTS = [{ 'url': 'http://deviceids-medp.wdr.de/ondemand/155/1557833.js', 'info_dict': { 'id': 'mdb-1557833', @@ -32,11 +34,20 @@ class WDRIE(InfoExtractor): 'title': 'Biathlon-Staffel verpasst Podest bei Olympia-Generalprobe', 'upload_date': '20180112', }, - } + }, + ] + + def _asset_url(self, wdr_id): + id_len = max(len(wdr_id), 5) + return ''.join(('https:', self.__API_URL_TPL % (wdr_id[:id_len - 4], wdr_id, ), '.js')) def _real_extract(self, url): video_id = self._match_id(url) + if url.startswith('wdr:'): + video_id = url[4:] + url = self._asset_url(video_id) + metadata = self._download_json( url, video_id, transform_source=strip_jsonp) @@ -115,10 +126,10 @@ class WDRIE(InfoExtractor): } -class WDRPageIE(InfoExtractor): - _CURRENT_MAUS_URL = r'https?://(?:www\.)wdrmaus.de/(?:[^/]+/){1,2}[^/?#]+\.php5' +class WDRPageIE(WDRIE): + _MAUS_REGEX = r'https?://(?:www\.)wdrmaus.de/(?:[^/]+/)*?(?P[^/?#.]+)(?:/?|/index\.php5|\.php5)$' _PAGE_REGEX = r'/(?:mediathek/)?(?:[^/]+/)*(?P[^/]+)\.html' - _VALID_URL = r'https?://(?:www\d?\.)?(?:(?:kinder\.)?wdr\d?|sportschau)\.de' + _PAGE_REGEX + '|' + _CURRENT_MAUS_URL + _VALID_URL = r'https?://(?:www\d?\.)?(?:(?:kinder\.)?wdr\d?|sportschau)\.de' + _PAGE_REGEX + '|' + _MAUS_REGEX _TESTS = [ { @@ -180,12 +191,12 @@ class WDRPageIE(InfoExtractor): { 'url': 'http://www.wdrmaus.de/aktuelle-sendung/index.php5', 'info_dict': { - 'id': 'mdb-1552552', + 'id': 'mdb-2627637', 'ext': 'mp4', 'upload_date': 're:^[0-9]{8}$', - 'title': 're:^Die Sendung mit der Maus vom [0-9.]{10}$', + 'title': 're:^Die Sendung (?:mit der Maus )?vom [0-9.]{10}$', }, - 'skip': 'The id changes from week to week because of the new episode' + # 'skip': 'The id changes from week to week because of the new episode' }, { 'url': 'http://www.wdrmaus.de/filme/sachgeschichten/achterbahn.php5', @@ -234,7 +245,7 @@ class WDRPageIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - display_id = mobj.group('display_id') + display_id = dict_get(mobj.groupdict(), ('display_id', 'maus_id'), 'wdrmaus') webpage = self._download_webpage(url, display_id) entries = [] @@ -260,6 +271,14 @@ class WDRPageIE(InfoExtractor): jsonp_url = try_get( media_link_obj, lambda x: x['mediaObj']['url'], compat_str) if jsonp_url: + # metadata, or player JS with ['ref'] giving WDR id, or just media, perhaps + clip_id = media_link_obj['mediaObj'].get('ref') + if jsonp_url.endswith('.assetjsonp'): + asset = self._download_json( + jsonp_url, display_id, fatal=False, transform_source=strip_jsonp) + clip_id = try_get(asset, lambda x: x['trackerData']['trackerClipId'], compat_str) + if clip_id: + jsonp_url = self._asset_url(clip_id[4:]) entries.append(self.url_result(jsonp_url, ie=WDRIE.ie_key())) # Playlist (e.g. https://www1.wdr.de/mediathek/video/sendungen/aktuelle-stunde/aktuelle-stunde-120.html) From 96423449659131ed8e7bfaa7f791466c3f8f2db1 Mon Sep 17 00:00:00 2001 From: dirkf Date: Tue, 25 Jan 2022 13:04:04 +0000 Subject: [PATCH 36/70] Fix tests for working IEs; disable obsolete WDRMobile --- youtube_dl/extractor/wdr.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py index a5488f3fd..10db73148 100644 --- a/youtube_dl/extractor/wdr.py +++ b/youtube_dl/extractor/wdr.py @@ -170,11 +170,11 @@ class WDRPageIE(WDRIE): { 'url': 'http://www1.wdr.de/mediathek/video/live/index.html', 'info_dict': { - 'id': 'mdb-1406149', + 'id': 'mdb-2296252', 'ext': 'mp4', - 'title': r're:^WDR Fernsehen im Livestream \(nur in Deutschland erreichbar\) [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'title': r're:^WDR Fernsehen im Livestream (?:\(nur in Deutschland erreichbar\) )?[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'alt_title': 'WDR Fernsehen Live', - 'upload_date': '20150101', + 'upload_date': '20201112', 'is_live': True, }, 'params': { @@ -183,7 +183,7 @@ class WDRPageIE(WDRIE): }, { 'url': 'http://www1.wdr.de/mediathek/video/sendungen/aktuelle-stunde/aktuelle-stunde-120.html', - 'playlist_mincount': 7, + 'playlist_mincount': 6, 'info_dict': { 'id': 'aktuelle-stunde-120', }, @@ -196,7 +196,7 @@ class WDRPageIE(WDRIE): 'upload_date': 're:^[0-9]{8}$', 'title': 're:^Die Sendung (?:mit der Maus )?vom [0-9.]{10}$', }, - # 'skip': 'The id changes from week to week because of the new episode' + 'skip': 'The id changes from week to week because of the new episode' }, { 'url': 'http://www.wdrmaus.de/filme/sachgeschichten/achterbahn.php5', @@ -207,6 +207,7 @@ class WDRPageIE(WDRIE): 'upload_date': '20130919', 'title': 'Sachgeschichte - Achterbahn ', }, + 'skip': 'HTTP Error 404: Not Found', }, { 'url': 'http://www1.wdr.de/radio/player/radioplayer116~_layout-popupVersion.html', @@ -232,6 +233,7 @@ class WDRPageIE(WDRIE): 'params': { 'skip_download': True, }, + 'skip': 'HTTP Error 404: Not Found', }, { 'url': 'http://www.sportschau.de/handballem2018/audio-vorschau---die-handball-em-startet-mit-grossem-favoritenfeld-100.html', @@ -298,16 +300,14 @@ class WDRPageIE(WDRIE): class WDRElefantIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)wdrmaus\.de/elefantenseite/#(?P.+)' _TEST = { - 'url': 'http://www.wdrmaus.de/elefantenseite/#folge_ostern_2015', + 'url': 'http://www.wdrmaus.de/elefantenseite/#elefantenkino_wippe', + # adaptive stream: unstable file MD5 'info_dict': { - 'title': 'Folge Oster-Spezial 2015', - 'id': 'mdb-1088195', + 'title': 'Wippe', + 'id': 'mdb-1198320', 'ext': 'mp4', 'age_limit': None, - 'upload_date': '20150406' - }, - 'params': { - 'skip_download': True, + 'upload_date': '20071003' }, } @@ -342,6 +342,7 @@ class WDRMobileIE(InfoExtractor): /[0-9]+/[0-9]+/ (?P[0-9]+)_(?P[0-9]+)''' IE_NAME = 'wdr:mobile' + _WORKING = False # no such domain _TEST = { 'url': 'http://mobile-ondemand.wdr.de/CMS2010/mdb/ondemand/weltweit/fsk0/42/421735/421735_4283021.mp4', 'info_dict': { From 23ad6402a6966dd09e4c854f32c33f69be1a064e Mon Sep 17 00:00:00 2001 From: Chris Rose <offline@offby1.net> Date: Fri, 26 Nov 2021 08:08:17 -0800 Subject: [PATCH 37/70] xvideos: Fix for #30271 --- youtube_dl/extractor/xvideos.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/xvideos.py b/youtube_dl/extractor/xvideos.py index 8fc64914c..e63d4690d 100644 --- a/youtube_dl/extractor/xvideos.py +++ b/youtube_dl/extractor/xvideos.py @@ -82,7 +82,7 @@ class XVideosIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage( - 'https://www.xvideos.com/video%s/' % video_id, video_id) + 'https://www.xvideos.com/video%s/0' % video_id, video_id) mobj = re.search(r'<h1 class="inlineError">(.+?)</h1>', webpage) if mobj: From 005339d6375f2d2a4cec962b1c1a157c1dffbf8f Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Wed, 8 Dec 2021 23:37:54 +0000 Subject: [PATCH 38/70] [applepodcasts] Support new AMP-ish page structure --- youtube_dl/extractor/applepodcasts.py | 43 ++++++++++++++++++++------- 1 file changed, 33 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/applepodcasts.py b/youtube_dl/extractor/applepodcasts.py index 6a74de758..f0186d4bf 100644 --- a/youtube_dl/extractor/applepodcasts.py +++ b/youtube_dl/extractor/applepodcasts.py @@ -3,7 +3,9 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( + clean_html, clean_podcast_url, + get_element_by_class, int_or_none, parse_iso8601, try_get, @@ -14,15 +16,15 @@ class ApplePodcastsIE(InfoExtractor): _VALID_URL = r'https?://podcasts\.apple\.com/(?:[^/]+/)?podcast(?:/[^/]+){1,2}.*?\bi=(?P<id>\d+)' _TESTS = [{ 'url': 'https://podcasts.apple.com/us/podcast/207-whitney-webb-returns/id1135137367?i=1000482637777', - 'md5': 'df02e6acb11c10e844946a39e7222b08', + 'md5': '41dc31cd650143e530d9423b6b5a344f', 'info_dict': { 'id': '1000482637777', 'ext': 'mp3', 'title': '207 - Whitney Webb Returns', - 'description': 'md5:13a73bade02d2e43737751e3987e1399', + 'description': 'md5:75ef4316031df7b41ced4e7b987f79c6', 'upload_date': '20200705', - 'timestamp': 1593921600, - 'duration': 6425, + 'timestamp': 1593932400, + 'duration': 6454, 'series': 'The Tim Dillon Show', } }, { @@ -39,17 +41,38 @@ class ApplePodcastsIE(InfoExtractor): def _real_extract(self, url): episode_id = self._match_id(url) webpage = self._download_webpage(url, episode_id) - ember_data = self._parse_json(self._search_regex( - r'id="shoebox-ember-data-store"[^>]*>\s*({.+?})\s*<', - webpage, 'ember data'), episode_id) - ember_data = ember_data.get(episode_id) or ember_data - episode = ember_data['data']['attributes'] + episode_data = {} + ember_data = {} + # new page type 2021-11 + amp_data = self._parse_json(self._search_regex( + r'(?s)id="shoebox-media-api-cache-amp-podcasts"[^>]*>\s*({.+?})\s*<', + webpage, 'AMP data', default='{}'), episode_id, fatal=False) or {} + amp_data = try_get(amp_data, + lambda a: self._parse_json( + next(a[x] for x in iter(a) if episode_id in x), + episode_id), + dict) or {} + amp_data = amp_data.get('d') or [] + episode_data = try_get( + amp_data, + lambda a: next(x for x in a + if x['type'] == 'podcast-episodes' and x['id'] == episode_id), + dict) + if not episode_data: + # try pre 2021-11 page type: TODO: consider deleting if no longer used + ember_data = self._parse_json(self._search_regex( + r'(?s)id="shoebox-ember-data-store"[^>]*>\s*({.+?})\s*<', + webpage, 'ember data'), episode_id) or {} + ember_data = ember_data.get(episode_id) or ember_data + episode_data = try_get(ember_data, lambda x: x['data'], dict) + episode = episode_data['attributes'] description = episode.get('description') or {} series = None - for inc in (ember_data.get('included') or []): + for inc in (amp_data or ember_data.get('included') or []): if inc.get('type') == 'media/podcast': series = try_get(inc, lambda x: x['attributes']['name']) + series = series or clean_html(get_element_by_class('podcast-header__identity', webpage)) return { 'id': episode_id, From e00b0eab1e78ed822683b2689f60eab85514ac42 Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Thu, 9 Dec 2021 00:55:04 +0000 Subject: [PATCH 39/70] [applepodcasts] Improve format extraction Set acodec and vcodec, etc, to avoid breaking, eg, bestaudio --- youtube_dl/extractor/applepodcasts.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/applepodcasts.py b/youtube_dl/extractor/applepodcasts.py index f0186d4bf..dd413a289 100644 --- a/youtube_dl/extractor/applepodcasts.py +++ b/youtube_dl/extractor/applepodcasts.py @@ -7,6 +7,7 @@ from ..utils import ( clean_podcast_url, get_element_by_class, int_or_none, + parse_codecs, parse_iso8601, try_get, ) @@ -74,7 +75,7 @@ class ApplePodcastsIE(InfoExtractor): series = try_get(inc, lambda x: x['attributes']['name']) series = series or clean_html(get_element_by_class('podcast-header__identity', webpage)) - return { + info = [{ 'id': episode_id, 'title': episode['name'], 'url': clean_podcast_url(episode['assetUrl']), @@ -82,4 +83,9 @@ class ApplePodcastsIE(InfoExtractor): 'timestamp': parse_iso8601(episode.get('releaseDateTime')), 'duration': int_or_none(episode.get('durationInMilliseconds'), 1000), 'series': series, - } + }] + self._sort_formats(info) + info = info[0] + codecs = parse_codecs(info.get('ext', 'mp3')) + info.update(codecs) + return info From 584715a803eef68f68fbbb8b72a022a699983197 Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Thu, 9 Dec 2021 01:35:35 +0000 Subject: [PATCH 40/70] [applepodcasts] Extract default thumbnail image --- youtube_dl/extractor/applepodcasts.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/applepodcasts.py b/youtube_dl/extractor/applepodcasts.py index dd413a289..95e0f663c 100644 --- a/youtube_dl/extractor/applepodcasts.py +++ b/youtube_dl/extractor/applepodcasts.py @@ -27,6 +27,7 @@ class ApplePodcastsIE(InfoExtractor): 'timestamp': 1593932400, 'duration': 6454, 'series': 'The Tim Dillon Show', + 'thumbnail': 're:.+[.](png|jpe?g|webp)', } }, { 'url': 'https://podcasts.apple.com/podcast/207-whitney-webb-returns/id1135137367?i=1000482637777', @@ -83,6 +84,7 @@ class ApplePodcastsIE(InfoExtractor): 'timestamp': parse_iso8601(episode.get('releaseDateTime')), 'duration': int_or_none(episode.get('durationInMilliseconds'), 1000), 'series': series, + 'thumbnail': self._og_search_thumbnail(webpage), }] self._sort_formats(info) info = info[0] From 73e1ab6125eeea2b07942326cd2f1d6d9adff64e Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Mon, 6 Dec 2021 19:26:33 +0000 Subject: [PATCH 41/70] [test:download] Only extract enough videos for playlist_mincount --- test/parameters.json | 1 - test/test_download.py | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/test/parameters.json b/test/parameters.json index 65fd54428..864c9d130 100644 --- a/test/parameters.json +++ b/test/parameters.json @@ -18,7 +18,6 @@ "noprogress": false, "outtmpl": "%(id)s.%(ext)s", "password": null, - "playlistend": -1, "playliststart": 1, "prefer_free_formats": false, "quiet": false, diff --git a/test/test_download.py b/test/test_download.py index ebe820dfc..8e43cfa12 100644 --- a/test/test_download.py +++ b/test/test_download.py @@ -121,6 +121,7 @@ def generator(test_case, tname): params['outtmpl'] = tname + '_' + params['outtmpl'] if is_playlist and 'playlist' not in test_case: params.setdefault('extract_flat', 'in_playlist') + params.setdefault('playlistend', test_case.get('playlist_mincount')) params.setdefault('skip_download', True) ydl = YoutubeDL(params, auto_init=False) From 91278f4b6b5600e9ce65826ec9e7e38e7dba5937 Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Mon, 6 Dec 2021 20:52:21 +0000 Subject: [PATCH 42/70] [niconico] Back-port extractor from yt-dlp Add Nico search extractors, fix extraction --- youtube_dl/extractor/extractors.py | 9 +- youtube_dl/extractor/niconico.py | 646 +++++++++++++++++++++-------- 2 files changed, 477 insertions(+), 178 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 4e9954c6a..e70daf2b1 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -789,7 +789,14 @@ from .nick import ( NickNightIE, NickRuIE, ) -from .niconico import NiconicoIE, NiconicoPlaylistIE +from .niconico import ( + NiconicoIE, + NiconicoPlaylistIE, + NiconicoUserIE, + NicovideoSearchIE, + NicovideoSearchDateIE, + NicovideoSearchURLIE, +) from .ninecninemedia import NineCNineMediaIE from .ninegag import NineGagIE from .ninenow import NineNowIE diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py index a85fc3d5c..756ad0e25 100644 --- a/youtube_dl/extractor/niconico.py +++ b/youtube_dl/extractor/niconico.py @@ -2,25 +2,28 @@ from __future__ import unicode_literals import datetime -import functools +import itertools import json -import math +import re -from .common import InfoExtractor +from .common import InfoExtractor, SearchInfoExtractor +from ..postprocessor.ffmpeg import FFmpegPostProcessor from ..compat import ( compat_parse_qs, + compat_str, compat_urllib_parse_urlparse, ) from ..utils import ( - determine_ext, - dict_get, ExtractorError, + dict_get, float_or_none, - InAdvancePagedList, int_or_none, + OnDemandPagedList, parse_duration, parse_iso8601, + PostProcessingError, remove_start, + str_or_none, try_get, unified_timestamp, urlencode_postdata, @@ -34,7 +37,7 @@ class NiconicoIE(InfoExtractor): _TESTS = [{ 'url': 'http://www.nicovideo.jp/watch/sm22312215', - 'md5': 'd1a75c0823e2f629128c43e1212760f9', + 'md5': 'a5bad06f1347452102953f323c69da34s', 'info_dict': { 'id': 'sm22312215', 'ext': 'mp4', @@ -162,6 +165,11 @@ class NiconicoIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.|secure\.|sp\.)?nicovideo\.jp/watch/(?P<id>(?:[a-z]{2})?[0-9]+)' _NETRC_MACHINE = 'niconico' + _API_HEADERS = { + 'X-Frontend-ID': '6', + 'X-Frontend-Version': '0' + } + def _real_initialize(self): self._login() @@ -191,37 +199,89 @@ class NiconicoIE(InfoExtractor): self._downloader.report_warning('unable to log in: bad username or password') return login_ok - def _extract_format_for_quality(self, api_data, video_id, audio_quality, video_quality): - def yesno(boolean): - return 'yes' if boolean else 'no' + def _get_heartbeat_info(self, info_dict): - session_api_data = api_data['video']['dmcInfo']['session_api'] - session_api_endpoint = session_api_data['urls'][0] + video_id, video_src_id, audio_src_id = info_dict['url'].split(':')[1].split('/') - format_id = '-'.join(map(lambda s: remove_start(s['id'], 'archive_'), [video_quality, audio_quality])) + api_data = ( + info_dict.get('_api_data') + or self._parse_json( + self._html_search_regex( + 'data-api-data="([^"]+)"', + self._download_webpage('http://www.nicovideo.jp/watch/' + video_id, video_id), + 'API data', default='{}'), + video_id)) + + session_api_data = try_get(api_data, lambda x: x['media']['delivery']['movie']['session']) + session_api_endpoint = try_get(session_api_data, lambda x: x['urls'][0]) + + def ping(): + status = try_get( + self._download_json( + 'https://nvapi.nicovideo.jp/v1/2ab0cbaa/watch', video_id, + query={'t': try_get(api_data, lambda x: x['media']['delivery']['trackingId'])}, + note='Acquiring permission for downloading video', + headers=self._API_HEADERS), + lambda x: x['meta']['status']) + if status != 200: + self.report_warning('Failed to acquire permission for playing video. The video may not download.') + + yesno = lambda x: 'yes' if x else 'no' + + # m3u8 (encryption) + if try_get(api_data, lambda x: x['media']['delivery']['encryption']) is not None: + protocol = 'm3u8' + encryption = self._parse_json(session_api_data['token'], video_id)['hls_encryption'] + session_api_http_parameters = { + 'parameters': { + 'hls_parameters': { + 'encryption': { + encryption: { + 'encrypted_key': try_get(api_data, lambda x: x['media']['delivery']['encryption']['encryptedKey']), + 'key_uri': try_get(api_data, lambda x: x['media']['delivery']['encryption']['keyUri']) + } + }, + 'transfer_preset': '', + 'use_ssl': yesno(session_api_endpoint['isSsl']), + 'use_well_known_port': yesno(session_api_endpoint['isWellKnownPort']), + 'segment_duration': 6000, + } + } + } + # http + else: + protocol = 'http' + session_api_http_parameters = { + 'parameters': { + 'http_output_download_parameters': { + 'use_ssl': yesno(session_api_endpoint['isSsl']), + 'use_well_known_port': yesno(session_api_endpoint['isWellKnownPort']), + } + } + } session_response = self._download_json( session_api_endpoint['url'], video_id, query={'_format': 'json'}, headers={'Content-Type': 'application/json'}, - note='Downloading JSON metadata for %s' % format_id, + note='Downloading JSON metadata for %s' % info_dict['format_id'], data=json.dumps({ 'session': { 'client_info': { - 'player_id': session_api_data['player_id'], + 'player_id': session_api_data.get('playerId'), }, 'content_auth': { - 'auth_type': session_api_data['auth_types'][session_api_data['protocols'][0]], - 'content_key_timeout': session_api_data['content_key_timeout'], + 'auth_type': try_get(session_api_data, lambda x: x['authTypes'][session_api_data['protocols'][0]]), + 'content_key_timeout': session_api_data.get('contentKeyTimeout'), 'service_id': 'nicovideo', - 'service_user_id': session_api_data['service_user_id'] + 'service_user_id': session_api_data.get('serviceUserId') }, - 'content_id': session_api_data['content_id'], + 'content_id': session_api_data.get('contentId'), 'content_src_id_sets': [{ 'content_src_ids': [{ 'src_id_to_mux': { - 'audio_src_ids': [audio_quality['id']], - 'video_src_ids': [video_quality['id']], + 'audio_src_ids': [audio_src_id], + 'video_src_ids': [video_src_id], } }] }], @@ -229,52 +289,81 @@ class NiconicoIE(InfoExtractor): 'content_uri': '', 'keep_method': { 'heartbeat': { - 'lifetime': session_api_data['heartbeat_lifetime'] + 'lifetime': session_api_data.get('heartbeatLifetime') } }, - 'priority': session_api_data['priority'], + 'priority': session_api_data.get('priority'), 'protocol': { 'name': 'http', 'parameters': { - 'http_parameters': { - 'parameters': { - 'http_output_download_parameters': { - 'use_ssl': yesno(session_api_endpoint['is_ssl']), - 'use_well_known_port': yesno(session_api_endpoint['is_well_known_port']), - } - } - } + 'http_parameters': session_api_http_parameters } }, - 'recipe_id': session_api_data['recipe_id'], + 'recipe_id': session_api_data.get('recipeId'), 'session_operation_auth': { 'session_operation_auth_by_signature': { - 'signature': session_api_data['signature'], - 'token': session_api_data['token'], + 'signature': session_api_data.get('signature'), + 'token': session_api_data.get('token'), } }, 'timing_constraint': 'unlimited' } }).encode()) - resolution = video_quality.get('resolution', {}) + info_dict['url'] = session_response['data']['session']['content_uri'] + info_dict['protocol'] = protocol + + # get heartbeat info + heartbeat_info_dict = { + 'url': session_api_endpoint['url'] + '/' + session_response['data']['session']['id'] + '?_format=json&_method=PUT', + 'data': json.dumps(session_response['data']), + # interval, convert milliseconds to seconds, then halve to make a buffer. + 'interval': float_or_none(session_api_data.get('heartbeatLifetime'), scale=3000), + 'ping': ping + } + + return info_dict, heartbeat_info_dict + + def _extract_format_for_quality(self, api_data, video_id, audio_quality, video_quality): + def parse_format_id(id_code): + mobj = re.match(r'''(?x) + (?:archive_)? + (?:(?P<codec>[^_]+)_)? + (?:(?P<br>[\d]+)kbps_)? + (?:(?P<res>[\d+]+)p_)? + ''', '%s_' % id_code) + return mobj.groupdict() if mobj else {} + + protocol = 'niconico_dmc' + format_id = '-'.join(map(lambda s: remove_start(s['id'], 'archive_'), [video_quality, audio_quality])) + vdict = parse_format_id(video_quality['id']) + adict = parse_format_id(audio_quality['id']) + resolution = try_get(video_quality, lambda x: x['metadata']['resolution'], dict) or {'height': vdict.get('res')} + vbr = try_get(video_quality, lambda x: x['metadata']['bitrate'], float) return { - 'url': session_response['data']['session']['content_uri'], + 'url': '%s:%s/%s/%s' % (protocol, video_id, video_quality['id'], audio_quality['id']), 'format_id': format_id, + 'format_note': 'DMC %s' % try_get(video_quality, lambda x: x['metadata']['label'], compat_str), 'ext': 'mp4', # Session API are used in HTML5, which always serves mp4 - 'abr': float_or_none(audio_quality.get('bitrate'), 1000), - 'vbr': float_or_none(video_quality.get('bitrate'), 1000), - 'height': resolution.get('height'), - 'width': resolution.get('width'), + 'vcodec': vdict.get('codec'), + 'acodec': adict.get('codec'), + 'vbr': float_or_none(vbr, 1000) or float_or_none(vdict.get('br')), + 'abr': float_or_none(audio_quality.get('bitrate'), 1000) or float_or_none(adict.get('br')), + 'height': int_or_none(resolution.get('height', vdict.get('res'))), + 'width': int_or_none(resolution.get('width')), + 'quality': -2 if 'low' in format_id else -1, # Default quality value is -1 + 'protocol': protocol, + 'http_headers': { + 'Origin': 'https://www.nicovideo.jp', + 'Referer': 'https://www.nicovideo.jp/watch/' + video_id, + } } def _real_extract(self, url): video_id = self._match_id(url) - # Get video webpage. We are not actually interested in it for normal - # cases, but need the cookies in order to be able to download the - # info webpage + # Get video webpage for API data. webpage, handle = self._download_webpage_handle( 'http://www.nicovideo.jp/watch/' + video_id, video_id) if video_id.startswith('so'): @@ -284,86 +373,136 @@ class NiconicoIE(InfoExtractor): 'data-api-data="([^"]+)"', webpage, 'API data', default='{}'), video_id) - def _format_id_from_url(video_url): - return 'economy' if video_real_url.endswith('low') else 'normal' + def get_video_info_web(items): + return dict_get(api_data['video'], items) - try: - video_real_url = api_data['video']['smileInfo']['url'] - except KeyError: # Flash videos - # Get flv info - flv_info_webpage = self._download_webpage( - 'http://flapi.nicovideo.jp/api/getflv/' + video_id + '?as3=1', - video_id, 'Downloading flv info') + # Get video info + video_info_xml = self._download_xml( + 'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id, + video_id, note='Downloading video info page') - flv_info = compat_parse_qs(flv_info_webpage) - if 'url' not in flv_info: - if 'deleted' in flv_info: - raise ExtractorError('The video has been deleted.', - expected=True) - elif 'closed' in flv_info: - raise ExtractorError('Niconico videos now require logging in', - expected=True) - elif 'error' in flv_info: - raise ExtractorError('%s reports error: %s' % ( - self.IE_NAME, flv_info['error'][0]), expected=True) - else: - raise ExtractorError('Unable to find video URL') + def get_video_info_xml(items): + if not isinstance(items, list): + items = [items] + for item in items: + ret = xpath_text(video_info_xml, './/' + item) + if ret: + return ret - video_info_xml = self._download_xml( - 'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id, - video_id, note='Downloading video info page') + if get_video_info_xml('error'): + error_code = get_video_info_xml('code') - def get_video_info(items): - if not isinstance(items, list): - items = [items] - for item in items: - ret = xpath_text(video_info_xml, './/' + item) - if ret: - return ret + if error_code == 'DELETED': + raise ExtractorError('The video has been deleted.', + expected=True) + elif error_code == 'NOT_FOUND': + raise ExtractorError('The video is not found.', + expected=True) + elif error_code == 'COMMUNITY': + self.to_screen('%s: The video is community members only.' % video_id) + else: + raise ExtractorError('%s reports error: %s' % (self.IE_NAME, error_code)) - video_real_url = flv_info['url'][0] + # Start extracting video formats + formats = [] - extension = get_video_info('movie_type') - if not extension: - extension = determine_ext(video_real_url) + # Get HTML5 videos info + quality_info = try_get(api_data, lambda x: x['media']['delivery']['movie']) + if not quality_info: + raise ExtractorError('The video can\'t be downloaded', expected=True) - formats = [{ - 'url': video_real_url, - 'ext': extension, - 'format_id': _format_id_from_url(video_real_url), - }] - else: - formats = [] + for audio_quality in quality_info.get('audios') or {}: + for video_quality in quality_info.get('videos') or {}: + if not audio_quality.get('isAvailable') or not video_quality.get('isAvailable'): + continue + formats.append(self._extract_format_for_quality( + api_data, video_id, audio_quality, video_quality)) - dmc_info = api_data['video'].get('dmcInfo') - if dmc_info: # "New" HTML5 videos - quality_info = dmc_info['quality'] - for audio_quality in quality_info['audios']: - for video_quality in quality_info['videos']: - if not audio_quality['available'] or not video_quality['available']: - continue - formats.append(self._extract_format_for_quality( - api_data, video_id, audio_quality, video_quality)) + # Get flv/swf info + timestamp = None + video_real_url = try_get(api_data, lambda x: x['video']['smileInfo']['url']) + if video_real_url: + is_economy = video_real_url.endswith('low') - self._sort_formats(formats) - else: # "Old" HTML5 videos - formats = [{ + if is_economy: + self.report_warning('Site is currently in economy mode! You will only have access to lower quality streams') + + # Invoking ffprobe to determine resolution + pp = FFmpegPostProcessor(self._downloader) + cookies = self._get_cookies('https://nicovideo.jp').output(header='', sep='; path=/; domain=nicovideo.jp;\n') + + self.to_screen('%s: %s' % (video_id, 'Checking smile format with ffprobe')) + + try: + metadata = pp.get_metadata_object(video_real_url, ['-cookies', cookies]) + except PostProcessingError as err: + raise ExtractorError(err.msg, expected=True) + + v_stream = a_stream = {} + + # Some complex swf files doesn't have video stream (e.g. nm4809023) + for stream in metadata['streams']: + if stream['codec_type'] == 'video': + v_stream = stream + elif stream['codec_type'] == 'audio': + a_stream = stream + + # Community restricted videos seem to have issues with the thumb API not returning anything at all + filesize = int( + (get_video_info_xml('size_high') if not is_economy else get_video_info_xml('size_low')) + or metadata['format']['size'] + ) + extension = ( + get_video_info_xml('movie_type') + or 'mp4' if 'mp4' in metadata['format']['format_name'] else metadata['format']['format_name'] + ) + + # 'creation_time' tag on video stream of re-encoded SMILEVIDEO mp4 files are '1970-01-01T00:00:00.000000Z'. + timestamp = ( + parse_iso8601(get_video_info_web('first_retrieve')) + or unified_timestamp(get_video_info_web('postedDateTime')) + ) + metadata_timestamp = ( + parse_iso8601(try_get(v_stream, lambda x: x['tags']['creation_time'])) + or timestamp if extension != 'mp4' else 0 + ) + + # According to compconf, smile videos from pre-2017 are always better quality than their DMC counterparts + smile_threshold_timestamp = parse_iso8601('2016-12-08T00:00:00+09:00') + + is_source = timestamp < smile_threshold_timestamp or metadata_timestamp > 0 + + # If movie file size is unstable, old server movie is not source movie. + if filesize > 1: + formats.append({ 'url': video_real_url, - 'ext': 'mp4', - 'format_id': _format_id_from_url(video_real_url), - }] + 'format_id': 'smile' if not is_economy else 'smile_low', + 'format_note': 'SMILEVIDEO source' if not is_economy else 'SMILEVIDEO low quality', + 'ext': extension, + 'container': extension, + 'vcodec': v_stream.get('codec_name'), + 'acodec': a_stream.get('codec_name'), + # Some complex swf files doesn't have total bit rate metadata (e.g. nm6049209) + 'tbr': int_or_none(metadata['format'].get('bit_rate'), scale=1000), + 'vbr': int_or_none(v_stream.get('bit_rate'), scale=1000), + 'abr': int_or_none(a_stream.get('bit_rate'), scale=1000), + 'height': int_or_none(v_stream.get('height')), + 'width': int_or_none(v_stream.get('width')), + 'source_preference': 5 if not is_economy else -2, + 'quality': 5 if is_source and not is_economy else None, + 'filesize': filesize + }) - def get_video_info(items): - return dict_get(api_data['video'], items) + self._sort_formats(formats) # Start extracting information - title = get_video_info('title') - if not title: - title = self._og_search_title(webpage, default=None) - if not title: - title = self._html_search_regex( + title = ( + get_video_info_xml('title') # prefer to get the untranslated original title + or get_video_info_web(['originalTitle', 'title']) + or self._og_search_title(webpage, default=None) + or self._html_search_regex( r'<span[^>]+class="videoHeaderTitle"[^>]*>([^<]+)</span>', - webpage, 'video title') + webpage, 'video title')) watch_api_data_string = self._html_search_regex( r'<div[^>]+id="watchAPIDataContainer"[^>]+>([^<]+)</div>', @@ -372,14 +511,15 @@ class NiconicoIE(InfoExtractor): video_detail = watch_api_data.get('videoDetail', {}) thumbnail = ( - get_video_info(['thumbnail_url', 'thumbnailURL']) + self._html_search_regex(r'<meta property="og:image" content="([^"]+)">', webpage, 'thumbnail data', default=None) + or dict_get( # choose highest from 720p to 240p + get_video_info_web('thumbnail'), + ['ogp', 'player', 'largeUrl', 'middleUrl', 'url']) or self._html_search_meta('image', webpage, 'thumbnail', default=None) or video_detail.get('thumbnail')) - description = get_video_info('description') + description = get_video_info_web('description') - timestamp = (parse_iso8601(get_video_info('first_retrieve')) - or unified_timestamp(get_video_info('postedDateTime'))) if not timestamp: match = self._html_search_meta('datePublished', webpage, 'date published', default=None) if match: @@ -388,19 +528,25 @@ class NiconicoIE(InfoExtractor): timestamp = parse_iso8601( video_detail['postedAt'].replace('/', '-'), delimiter=' ', timezone=datetime.timedelta(hours=9)) + timestamp = timestamp or try_get(api_data, lambda x: parse_iso8601(x['video']['registeredAt'])) - view_count = int_or_none(get_video_info(['view_counter', 'viewCount'])) + view_count = int_or_none(get_video_info_web(['view_counter', 'viewCount'])) if not view_count: match = self._html_search_regex( r'>Views: <strong[^>]*>([^<]+)</strong>', webpage, 'view count', default=None) if match: view_count = int_or_none(match.replace(',', '')) - view_count = view_count or video_detail.get('viewCount') + view_count = ( + view_count + or video_detail.get('viewCount') + or try_get(api_data, lambda x: x['video']['count']['view'])) + + comment_count = ( + int_or_none(get_video_info_web('comment_num')) + or video_detail.get('commentCount') + or try_get(api_data, lambda x: x['video']['count']['comment'])) - comment_count = (int_or_none(get_video_info('comment_num')) - or video_detail.get('commentCount') - or try_get(api_data, lambda x: x['thread']['commentCount'])) if not comment_count: match = self._html_search_regex( r'>Comments: <strong[^>]*>([^<]+)</strong>', @@ -409,22 +555,41 @@ class NiconicoIE(InfoExtractor): comment_count = int_or_none(match.replace(',', '')) duration = (parse_duration( - get_video_info('length') + get_video_info_web('length') or self._html_search_meta( 'video:duration', webpage, 'video duration', default=None)) or video_detail.get('length') - or get_video_info('duration')) + or get_video_info_web('duration')) - webpage_url = get_video_info('watch_url') or url + webpage_url = get_video_info_web('watch_url') or url + + # for channel movie and community movie + channel_id = try_get( + api_data, + (lambda x: x['channel']['globalId'], + lambda x: x['community']['globalId'])) + channel = try_get( + api_data, + (lambda x: x['channel']['name'], + lambda x: x['community']['name'])) # Note: cannot use api_data.get('owner', {}) because owner may be set to "null" # in the JSON, which will cause None to be returned instead of {}. owner = try_get(api_data, lambda x: x.get('owner'), dict) or {} - uploader_id = get_video_info(['ch_id', 'user_id']) or owner.get('id') - uploader = get_video_info(['ch_name', 'user_nickname']) or owner.get('nickname') + uploader_id = str_or_none( + get_video_info_web(['ch_id', 'user_id']) + or owner.get('id') + or channel_id + ) + uploader = ( + get_video_info_web(['ch_name', 'user_nickname']) + or owner.get('nickname') + or channel + ) return { 'id': video_id, + '_api_data': api_data, 'title': title, 'formats': formats, 'thumbnail': thumbnail, @@ -432,6 +597,8 @@ class NiconicoIE(InfoExtractor): 'uploader': uploader, 'timestamp': timestamp, 'uploader_id': uploader_id, + 'channel': channel, + 'channel_id': channel_id, 'view_count': view_count, 'comment_count': comment_count, 'duration': duration, @@ -440,7 +607,7 @@ class NiconicoIE(InfoExtractor): class NiconicoPlaylistIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?nicovideo\.jp/(?:user/\d+/)?mylist/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?nicovideo\.jp/(?:user/\d+/|my/)?mylist/(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.nicovideo.jp/mylist/27411728', @@ -456,60 +623,185 @@ class NiconicoPlaylistIE(InfoExtractor): 'url': 'https://www.nicovideo.jp/user/805442/mylist/27411728', 'only_matching': True, }] - _PAGE_SIZE = 100 - def _call_api(self, list_id, resource, query): - return self._download_json( - 'https://nvapi.nicovideo.jp/v2/mylists/' + list_id, list_id, - 'Downloading %s JSON metatdata' % resource, query=query, - headers={'X-Frontend-Id': 6})['data']['mylist'] - - def _parse_owner(self, item): - owner = item.get('owner') or {} - if owner: - return { - 'uploader': owner.get('name'), - 'uploader_id': owner.get('id'), - } - return {} - - def _fetch_page(self, list_id, page): - page += 1 - items = self._call_api(list_id, 'page %d' % page, { - 'page': page, - 'pageSize': self._PAGE_SIZE, - })['items'] - for item in items: - video = item.get('video') or {} - video_id = video.get('id') - if not video_id: - continue - count = video.get('count') or {} - get_count = lambda x: int_or_none(count.get(x)) - info = { - '_type': 'url', - 'id': video_id, - 'title': video.get('title'), - 'url': 'https://www.nicovideo.jp/watch/' + video_id, - 'description': video.get('shortDescription'), - 'duration': int_or_none(video.get('duration')), - 'view_count': get_count('view'), - 'comment_count': get_count('comment'), - 'ie_key': NiconicoIE.ie_key(), - } - info.update(self._parse_owner(video)) - yield info + _API_HEADERS = { + 'X-Frontend-ID': '6', + 'X-Frontend-Version': '0' + } def _real_extract(self, url): list_id = self._match_id(url) - mylist = self._call_api(list_id, 'list', { - 'pageSize': 1, - }) - entries = InAdvancePagedList( - functools.partial(self._fetch_page, list_id), - math.ceil(mylist['totalItemCount'] / self._PAGE_SIZE), - self._PAGE_SIZE) - result = self.playlist_result( - entries, list_id, mylist.get('name'), mylist.get('description')) - result.update(self._parse_owner(mylist)) - return result + + def get_page_data(pagenum, pagesize): + return self._download_json( + 'http://nvapi.nicovideo.jp/v2/mylists/' + list_id, list_id, + query={'page': 1 + pagenum, 'pageSize': pagesize}, + headers=self._API_HEADERS).get('data').get('mylist') + + data = get_page_data(0, 1) + title = data.get('name') + description = data.get('description') + uploader = data.get('owner').get('name') + uploader_id = data.get('owner').get('id') + + def pagefunc(pagenum): + data = get_page_data(pagenum, 25) + return ({ + '_type': 'url', + 'url': 'http://www.nicovideo.jp/watch/' + item.get('watchId'), + } for item in data.get('items')) + + return { + '_type': 'playlist', + 'id': list_id, + 'title': title, + 'description': description, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'entries': OnDemandPagedList(pagefunc, 25), + } + + +class NicovideoSearchBaseIE(InfoExtractor): + _MAX_RESULTS = float('inf') + + def _entries(self, url, item_id, query=None, note='Downloading page %(page)s'): + query = query or {} + pages = [query['page']] if 'page' in query else itertools.count(1) + for page_num in pages: + query['page'] = str(page_num) + webpage = self._download_webpage(url, item_id, query=query, note=note % {'page': page_num}) + results = re.findall(r'(?<=data-video-id=)["\']?(?P<videoid>.+?)(?=["\'])', webpage) + for item in results: + yield self.url_result('http://www.nicovideo.jp/watch/%s' % item, 'Niconico', item) + if not results: + break + + def _get_n_results(self, query, n): + entries = self._entries(self._proto_relative_url('//www.nicovideo.jp/search/%s' % query), query) + if n < self._MAX_RESULTS: + entries = itertools.islice(entries, 0, n) + return self.playlist_result(entries, query, query) + + +class NicovideoSearchIE(NicovideoSearchBaseIE, SearchInfoExtractor): + IE_DESC = 'Nico video search' + IE_NAME = 'nicovideo:search' + _SEARCH_KEY = 'nicosearch' + + def _search_results(self, query): + return self._entries( + self._proto_relative_url('//www.nicovideo.jp/search/%s' % query), query) + + +class NicovideoSearchURLIE(NicovideoSearchBaseIE): + IE_NAME = '%s_url' % NicovideoSearchIE.IE_NAME + IE_DESC = 'Nico video search URLs' + _VALID_URL = r'https?://(?:www\.)?nicovideo\.jp/search/(?P<id>[^?#&]+)?' + _TESTS = [{ + 'url': 'http://www.nicovideo.jp/search/sm9', + 'info_dict': { + 'id': 'sm9', + 'title': 'sm9' + }, + 'playlist_mincount': 40, + }, { + 'url': 'https://www.nicovideo.jp/search/sm9?sort=h&order=d&end=2020-12-31&start=2020-01-01', + 'info_dict': { + 'id': 'sm9', + 'title': 'sm9' + }, + 'playlist_count': 31, + }] + + def _real_extract(self, url): + query = self._match_id(url) + return self.playlist_result(self._entries(url, query), query, query) + + +class NicovideoSearchDateIE(NicovideoSearchBaseIE, SearchInfoExtractor): + IE_DESC = 'Nico video search, newest first' + IE_NAME = '%s:date' % NicovideoSearchIE.IE_NAME + _SEARCH_KEY = 'nicosearchdate' + + _TESTS = [{ + 'url': 'nicosearchdateall:a', + 'info_dict': { + 'id': 'a', + 'title': 'a' + }, + 'playlist_mincount': 1610, + }] + + _START_DATE = datetime.date(2007, 1, 1) + _RESULTS_PER_PAGE = 32 + _MAX_PAGES = 50 + + def _entries(self, url, item_id, start_date=None, end_date=None): + start_date, end_date = start_date or self._START_DATE, end_date or datetime.datetime.now().date() + + # If the last page has a full page of videos, we need to break down the query interval further + last_page_len = len(list(self._get_entries_for_date( + url, item_id, start_date, end_date, self._MAX_PAGES, + note='Checking number of videos from {0} to {1}'.format(start_date, end_date)))) + if (last_page_len == self._RESULTS_PER_PAGE and start_date != end_date): + midpoint = start_date + ((end_date - start_date) // 2) + for entry in itertools.chain( + iter(self._entries(url, item_id, midpoint, end_date)), + iter(self._entries(url, item_id, start_date, midpoint))): + yield entry + else: + self.to_screen('{0}: Downloading results from {1} to {2}'.format(item_id, start_date, end_date)) + for entry in iter(self._get_entries_for_date( + url, item_id, start_date, end_date, note=' Downloading page %(page)s')): + yield entry + + def _get_entries_for_date(self, url, item_id, start_date, end_date=None, page_num=None, note=None): + query = { + 'start': compat_str(start_date), + 'end': compat_str(end_date or start_date), + 'sort': 'f', + 'order': 'd', + } + if page_num: + query['page'] = compat_str(page_num) + + for entry in iter(super(NicovideoSearchDateIE, self)._entries(url, item_id, query=query, note=note)): + yield entry + + +class NiconicoUserIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?nicovideo\.jp/user/(?P<id>\d+)/?(?:$|[#?])' + _TEST = { + 'url': 'https://www.nicovideo.jp/user/419948', + 'info_dict': { + 'id': '419948', + }, + 'playlist_mincount': 101, + } + _API_URL = "https://nvapi.nicovideo.jp/v1/users/%s/videos?sortKey=registeredAt&sortOrder=desc&pageSize=%s&page=%s" + _PAGE_SIZE = 100 + + _API_HEADERS = { + 'X-Frontend-ID': '6', + 'X-Frontend-Version': '0' + } + + def _entries(self, list_id): + total_count = 1 + count = page_num = 0 + while count < total_count: + json_parsed = self._download_json( + self._API_URL % (list_id, self._PAGE_SIZE, page_num + 1), list_id, + headers=self._API_HEADERS, + note='Downloading JSON metadata%s' % (' page %d' % page_num if page_num else '')) + if not page_num: + total_count = int_or_none(json_parsed['data'].get('totalCount')) + for entry in json_parsed["data"]["items"]: + count += 1 + yield self.url_result('https://www.nicovideo.jp/watch/%s' % entry['id']) + page_num += 1 + + def _real_extract(self, url): + list_id = self._match_id(url) + return self.playlist_result(self._entries(list_id), list_id) From 92d73ef3936ed6de9770f613fddf2260731becc9 Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Tue, 7 Dec 2021 23:30:30 +0000 Subject: [PATCH 43/70] [niconico] Implement heartbeat for download --- youtube_dl/downloader/__init__.py | 25 ++++++++---- youtube_dl/downloader/niconico.py | 66 +++++++++++++++++++++++++++++++ youtube_dl/extractor/niconico.py | 18 +++++++++ 3 files changed, 101 insertions(+), 8 deletions(-) create mode 100644 youtube_dl/downloader/niconico.py diff --git a/youtube_dl/downloader/__init__.py b/youtube_dl/downloader/__init__.py index 2e485df9d..d8f2fa342 100644 --- a/youtube_dl/downloader/__init__.py +++ b/youtube_dl/downloader/__init__.py @@ -1,22 +1,31 @@ from __future__ import unicode_literals +from ..utils import ( + determine_protocol, +) + + +def get_suitable_downloader(info_dict, params={}): + info_dict['protocol'] = determine_protocol(info_dict) + info_copy = info_dict.copy() + return _get_suitable_downloader(info_copy, params) + + +# Some of these require get_suitable_downloader from .common import FileDownloader +from .dash import DashSegmentsFD from .f4m import F4mFD from .hls import HlsFD from .http import HttpFD from .rtmp import RtmpFD -from .dash import DashSegmentsFD from .rtsp import RtspFD from .ism import IsmFD +from .niconico import NiconicoDmcFD from .external import ( get_external_downloader, FFmpegFD, ) -from ..utils import ( - determine_protocol, -) - PROTOCOL_MAP = { 'rtmp': RtmpFD, 'm3u8_native': HlsFD, @@ -26,13 +35,12 @@ PROTOCOL_MAP = { 'f4m': F4mFD, 'http_dash_segments': DashSegmentsFD, 'ism': IsmFD, + 'niconico_dmc': NiconicoDmcFD, } -def get_suitable_downloader(info_dict, params={}): +def _get_suitable_downloader(info_dict, params={}): """Get the downloader class that can handle the info dict.""" - protocol = determine_protocol(info_dict) - info_dict['protocol'] = protocol # if (info_dict.get('start_time') or info_dict.get('end_time')) and not info_dict.get('requested_formats') and FFmpegFD.can_download(info_dict): # return FFmpegFD @@ -43,6 +51,7 @@ def get_suitable_downloader(info_dict, params={}): if ed.can_download(info_dict): return ed + protocol = info_dict['protocol'] if protocol.startswith('m3u8') and info_dict.get('is_live'): return FFmpegFD diff --git a/youtube_dl/downloader/niconico.py b/youtube_dl/downloader/niconico.py new file mode 100644 index 000000000..6392c9989 --- /dev/null +++ b/youtube_dl/downloader/niconico.py @@ -0,0 +1,66 @@ +# coding: utf-8 +from __future__ import unicode_literals + +try: + import threading +except ImportError: + threading = None + +from .common import FileDownloader +from ..downloader import get_suitable_downloader +from ..extractor.niconico import NiconicoIE +from ..utils import sanitized_Request + + +class NiconicoDmcFD(FileDownloader): + """ Downloading niconico douga from DMC with heartbeat """ + + FD_NAME = 'niconico_dmc' + + def real_download(self, filename, info_dict): + self.to_screen('[%s] Downloading from DMC' % self.FD_NAME) + + ie = NiconicoIE(self.ydl) + info_dict, heartbeat_info_dict = ie._get_heartbeat_info(info_dict) + + fd = get_suitable_downloader(info_dict, params=self.params)(self.ydl, self.params) + for ph in self._progress_hooks: + fd.add_progress_hook(ph) + + if not threading: + self.to_screen('[%s] Threading for Heartbeat not available' % self.FD_NAME) + return fd.real_download(filename, info_dict) + + success = download_complete = False + timer = [None] + heartbeat_lock = threading.Lock() + heartbeat_url = heartbeat_info_dict['url'] + heartbeat_data = heartbeat_info_dict['data'].encode() + heartbeat_interval = heartbeat_info_dict.get('interval', 30) + + request = sanitized_Request(heartbeat_url, heartbeat_data) + + def heartbeat(): + try: + self.ydl.urlopen(request).read() + except Exception: + self.to_screen('[%s] Heartbeat failed' % self.FD_NAME) + + with heartbeat_lock: + if not download_complete: + timer[0] = threading.Timer(heartbeat_interval, heartbeat) + timer[0].start() + + heartbeat_info_dict['ping']() + self.to_screen('[%s] Heartbeat with %d second interval ...' % (self.FD_NAME, heartbeat_interval)) + try: + heartbeat() + if type(fd).__name__ == 'HlsFD': + info_dict.update(ie._extract_m3u8_formats(info_dict['url'], info_dict['id'])[0]) + success = fd.real_download(filename, info_dict) + finally: + if heartbeat_lock: + with heartbeat_lock: + timer[0].cancel() + download_complete = True + return success diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py index 756ad0e25..93f813968 100644 --- a/youtube_dl/extractor/niconico.py +++ b/youtube_dl/extractor/niconico.py @@ -160,6 +160,24 @@ class NiconicoIE(InfoExtractor): }, { 'url': 'http://sp.nicovideo.jp/watch/sm28964488?ss_pos=1&cp_in=wt_tg', 'only_matching': True, + }, { + # DMC video with heartbeat + 'url': 'https://www.nicovideo.jp/watch/sm34815188', + 'md5': '9360c6e1f1519d7759e2fe8e1326ae83', + 'info_dict': { + 'id': 'sm34815188', + 'ext': 'mp4', + 'title': 'md5:aee93e9f3366db72f902f6cd5d389cb7', + 'description': 'md5:7b9149fc7a00ab053cafaf5c19662704', + 'thumbnail': r're:https?://.*', + 'uploader': 'md5:2762e18fa74dbb40aa1ad27c6291ee32', + 'uploader_id': '67449889', + 'upload_date': '20190322', + 'timestamp': int, # timestamp is unstable + 'duration': 1082.0, + 'view_count': int, + 'comment_count': int, + }, }] _VALID_URL = r'https?://(?:www\.|secure\.|sp\.)?nicovideo\.jp/watch/(?P<id>(?:[a-z]{2})?[0-9]+)' From 6d4932f02347bb1d0228b20798435930022bf316 Mon Sep 17 00:00:00 2001 From: df <fieldhouse@gmx.net> Date: Sun, 18 Apr 2021 01:46:40 +0100 Subject: [PATCH 44/70] Try for timestamp, description from window.__INITIAL_DATA__ pages --- youtube_dl/extractor/bbc.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 247d982ce..37d427a66 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -1205,7 +1205,10 @@ class BBCIE(BBCCoUkIE): if name == 'media-experience': parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], dict)) elif name == 'article': - for block in (try_get(resp, lambda x: x['data']['blocks'], list) or []): + for block in (try_get(resp, + (lambda x: x['data']['blocks'], + lambda x: x['data']['content']['model']['blocks'],), + list) or []): if block.get('type') != 'media': continue parse_media(block.get('model')) From 58babe9af79215bd6bdf07da0a8ebb1d3650e00b Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Tue, 30 Nov 2021 05:15:33 +0000 Subject: [PATCH 45/70] Support __INITIAL_DATA__ with stringified JSON Add test and fix test for bbcthreeConfig --- youtube_dl/extractor/bbc.py | 50 +++++++++++++++++++++++++++++-------- 1 file changed, 39 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 37d427a66..088af9823 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -12,6 +12,7 @@ from ..compat import ( compat_HTTPError, compat_parse_qs, compat_str, + compat_urllib_error, compat_urllib_parse_urlparse, compat_urlparse, ) @@ -395,9 +396,17 @@ class BBCCoUkIE(InfoExtractor): formats.extend(self._extract_mpd_formats( href, programme_id, mpd_id=format_id, fatal=False)) elif transfer_format == 'hls': - formats.extend(self._extract_m3u8_formats( - href, programme_id, ext='mp4', entry_protocol='m3u8_native', - m3u8_id=format_id, fatal=False)) + # TODO: let expected_status be passed into _extract_xxx_formats() instead + try: + fmts = self._extract_m3u8_formats( + href, programme_id, ext='mp4', entry_protocol='m3u8_native', + m3u8_id=format_id, fatal=False) + except ExtractorError as e: + if not (isinstance(e.exc_info[1], compat_urllib_error.HTTPError) + and e.exc_info[1].code in (403, 404)): + raise + fmts = [] + formats.extend(fmts) elif transfer_format == 'hds': formats.extend(self._extract_f4m_formats( href, programme_id, f4m_id=format_id, fatal=False)) @@ -775,21 +784,33 @@ class BBCIE(BBCCoUkIE): 'timestamp': 1437785037, 'upload_date': '20150725', }, + }, { + # video with window.__INITIAL_DATA__ and value as JSON string + 'url': 'https://www.bbc.com/news/av/world-europe-59468682', + 'info_dict': { + 'id': 'p0b71qth', + 'ext': 'mp4', + 'title': 'Why France is making this woman a national hero', + 'description': 'md5:7affdfab80e9c3a1f976230a1ff4d5e4', + 'thumbnail': r're:https?://.+/.+\.jpg', + 'timestamp': 1638230731, + 'upload_date': '20211130', + }, }, { # single video article embedded with data-media-vpid 'url': 'http://www.bbc.co.uk/sport/rowing/35908187', 'only_matching': True, }, { + # bbcthreeConfig 'url': 'https://www.bbc.co.uk/bbcthree/clip/73d0bbd0-abc3-4cea-b3c0-cdae21905eb1', 'info_dict': { 'id': 'p06556y7', 'ext': 'mp4', - 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?', - 'description': 'md5:4b7dfd063d5a789a1512e99662be3ddd', + 'title': 'Things Not To Say to people that live on council estates', + 'description': "From being labelled a 'chav', to the presumption that they're 'scroungers', people who live on council estates encounter all kinds of prejudices and false assumptions about themselves, their families, and their lifestyles. Here, eight people discuss the common statements, misconceptions, and clichés that they're tired of hearing.", + 'duration': 360, + 'thumbnail': r're:https?://.+/.+\.jpg', }, - 'params': { - 'skip_download': True, - } }, { # window.__PRELOADED_STATE__ 'url': 'https://www.bbc.co.uk/radio/play/b0b9z4yl', @@ -1162,9 +1183,16 @@ class BBCIE(BBCCoUkIE): return self.playlist_result( entries, playlist_id, playlist_title, playlist_description) - initial_data = self._parse_json(self._search_regex( - r'window\.__INITIAL_DATA__\s*=\s*({.+?});', webpage, - 'preload state', default='{}'), playlist_id, fatal=False) + initial_data = self._search_regex( + r'window\.__INITIAL_DATA__\s*=\s*("{.+?}")\s*;', webpage, + 'quoted preload state', default=None) + if initial_data is None: + initial_data = self._search_regex( + r'window\.__INITIAL_DATA__\s*=\s*({.+?})\s*;', webpage, + 'preload state', default={}) + else: + initial_data = self._parse_json(initial_data or '"{}"', playlist_id, fatal=False) + initial_data = self._parse_json(initial_data, playlist_id, fatal=False) if initial_data: def parse_media(media): if not media: From c820a284a23438f065171b7e222024d01893a95f Mon Sep 17 00:00:00 2001 From: Abdullah Ibn Fulan <ibnfulan@tutanota.de> Date: Tue, 17 Aug 2021 18:22:07 +0600 Subject: [PATCH 46/70] [extractor/audiomack] Updated URL regex, corrected invalid testcases, fixed bug Co-authored-by: dirkf <fieldhouse@gmx.net> --- youtube_dl/extractor/audiomack.py | 40 ++++++++++++++++--------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/youtube_dl/extractor/audiomack.py b/youtube_dl/extractor/audiomack.py index cc7771354..638eb4041 100644 --- a/youtube_dl/extractor/audiomack.py +++ b/youtube_dl/extractor/audiomack.py @@ -14,7 +14,7 @@ from ..utils import ( class AudiomackIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?audiomack\.com/song/(?P<id>[\w/-]+)' + _VALID_URL = r'https?://(?:www\.)?audiomack\.com/(?:song/|(?=.+/song/))(?P<id>[\w/-]+)' IE_NAME = 'audiomack' _TESTS = [ # hosted on audiomack @@ -29,25 +29,27 @@ class AudiomackIE(InfoExtractor): } }, # audiomack wrapper around soundcloud song + # Needs new test URL. { 'add_ie': ['Soundcloud'], 'url': 'http://www.audiomack.com/song/hip-hop-daily/black-mamba-freestyle', - 'info_dict': { - 'id': '258901379', - 'ext': 'mp3', - 'description': 'mamba day freestyle for the legend Kobe Bryant ', - 'title': 'Black Mamba Freestyle [Prod. By Danny Wolf]', - 'uploader': 'ILOVEMAKONNEN', - 'upload_date': '20160414', - } + 'only_matching': True, + # 'info_dict': { + # 'id': '258901379', + # 'ext': 'mp3', + # 'description': 'mamba day freestyle for the legend Kobe Bryant ', + # 'title': 'Black Mamba Freestyle [Prod. By Danny Wolf]', + # 'uploader': 'ILOVEMAKONNEN', + # 'upload_date': '20160414', + # } }, ] def _real_extract(self, url): - # URLs end with [uploader name]/[uploader title] + # URLs end with [uploader name]/song/[uploader title] # this title is whatever the user types in, and is rarely # the proper song title. Real metadata is in the api response - album_url_tag = self._match_id(url) + album_url_tag = self._match_id(url).replace('/song/', '/') # Request the extended version of the api for extra fields like artist and title api_response = self._download_json( @@ -79,7 +81,7 @@ class AudiomackAlbumIE(InfoExtractor): # Standard album playlist { 'url': 'http://www.audiomack.com/album/flytunezcom/tha-tour-part-2-mixtape', - 'playlist_count': 15, + 'playlist_count': 11, 'info_dict': { 'id': '812251', @@ -95,24 +97,24 @@ class AudiomackAlbumIE(InfoExtractor): }, 'playlist': [{ 'info_dict': { - 'title': 'PPP (Pistol P Project) - 9. Heaven or Hell (CHIMACA) ft Zuse (prod by DJ FU)', - 'id': '837577', + 'title': 'PPP (Pistol P Project) - 10. 4 Minutes Of Hell Part 4 (prod by DY OF 808 MAFIA)', + 'id': '837580', 'ext': 'mp3', 'uploader': 'Lil Herb a.k.a. G Herbo', } }], 'params': { - 'playliststart': 9, - 'playlistend': 9, + 'playliststart': 2, + 'playlistend': 2, } } ] def _real_extract(self, url): - # URLs end with [uploader name]/[uploader title] + # URLs end with [uploader name]/album/[uploader title] # this title is whatever the user types in, and is rarely # the proper song title. Real metadata is in the api response - album_url_tag = self._match_id(url) + album_url_tag = self._match_id(url).replace('/album/', '/') result = {'_type': 'playlist', 'entries': []} # There is no one endpoint for album metadata - instead it is included/repeated in each song's metadata # Therefore we don't know how many songs the album has and must infi-loop until failure @@ -134,7 +136,7 @@ class AudiomackAlbumIE(InfoExtractor): # Pull out the album metadata and add to result (if it exists) for resultkey, apikey in [('id', 'album_id'), ('title', 'album_title')]: if apikey in api_response and resultkey not in result: - result[resultkey] = api_response[apikey] + result[resultkey] = compat_str(api_response[apikey]) song_id = url_basename(api_response['url']).rpartition('.')[0] result['entries'].append({ 'id': compat_str(api_response.get('id', song_id)), From 16a3fe2ba6b4c86e60bca930253c81c8efdd676b Mon Sep 17 00:00:00 2001 From: Abdullah Ibn Fulan <54185653+abdullah-if@users.noreply.github.com> Date: Tue, 17 Aug 2021 19:56:39 +0000 Subject: [PATCH 47/70] Updated Album URL regex Mistakenly forgot to edit a line in last commit. Co-authored-by: dirkf <fieldhouse@gmx.net> --- youtube_dl/extractor/audiomack.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/audiomack.py b/youtube_dl/extractor/audiomack.py index 638eb4041..4d1fbad1f 100644 --- a/youtube_dl/extractor/audiomack.py +++ b/youtube_dl/extractor/audiomack.py @@ -75,7 +75,7 @@ class AudiomackIE(InfoExtractor): class AudiomackAlbumIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?audiomack\.com/album/(?P<id>[\w/-]+)' + _VALID_URL = r'https?://(?:www\.)?audiomack\.com/(?:album/|(?=.+/album/))(?P<id>[\w/-]+)' IE_NAME = 'audiomack:album' _TESTS = [ # Standard album playlist From ddc080a562cce984ac4a86969f511b1ae59421bf Mon Sep 17 00:00:00 2001 From: df <fieldhouse@gmx.net> Date: Mon, 18 Oct 2021 15:54:26 +0100 Subject: [PATCH 48/70] Add ArteTVCategoryIE to support category playlists --- youtube_dl/extractor/arte.py | 47 ++++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + 2 files changed, 48 insertions(+) diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 03abdbfaf..5bfe57b10 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -12,6 +12,7 @@ from ..utils import ( ExtractorError, int_or_none, qualities, + strip_or_none, try_get, unified_strdate, url_or_none, @@ -252,3 +253,49 @@ class ArteTVPlaylistIE(ArteTVBaseIE): title = collection.get('title') description = collection.get('shortDescription') or collection.get('teaserText') return self.playlist_result(entries, playlist_id, title, description) + + +class ArteTVCategoryIE(ArteTVBaseIE): + _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>[\w-]+(?:/[\w-]+)*)/?\s*$' % ArteTVBaseIE._ARTE_LANGUAGES + _TESTS = [{ + 'url': 'https://www.arte.tv/en/videos/politics-and-society/', + 'info_dict': { + 'id': 'politics-and-society', + 'title': 'Politics and society', + 'description': 'Investigative documentary series, geopolitical analysis, and international commentary', + }, + 'playlist_mincount': 13, + }, + ] + + @classmethod + def suitable(cls, url): + return ( + not any(ie.suitable(url) for ie in (ArteTVIE, ArteTVPlaylistIE, )) + and super(ArteTVCategoryIE, cls).suitable(url)) + + def _real_extract(self, url): + lang, playlist_id = re.match(self._VALID_URL, url).groups() + webpage = self._download_webpage(url, playlist_id) + + items = [] + for video in re.finditer( + r'<a\b[^>]*?href\s*=\s*(?P<q>"|\'|\b)(?P<url>https?://www\.arte\.tv/%s/videos/[\w/-]+)(?P=q)' % lang, + webpage): + video = video.group('url') + if video == url: + continue + if any(ie.suitable(video) for ie in (ArteTVIE, ArteTVPlaylistIE, )): + items.append(video) + + if items: + title = (self._og_search_title(webpage, default=None) + or self._html_search_regex(r'<title\b[^>]*>([^<]+)', default=None)) + title = strip_or_none(title.rsplit('|', 1)[0]) or self._generic_title(url) + + result = self.playlist_from_matches(items, playlist_id=playlist_id, playlist_title=title) + if result: + description = self._og_search_description(webpage, default=None) + if description: + result['description'] = description + return result diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index e70daf2b1..50b7cb4a0 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -71,6 +71,7 @@ from .arte import ( ArteTVIE, ArteTVEmbedIE, ArteTVPlaylistIE, + ArteTVCategoryIE, ) from .arnes import ArnesIE from .asiancrush import ( From 734dfbb4e3ad4ee4d98609dc902ac864b94033a4 Mon Sep 17 00:00:00 2001 From: Seonghyeon Cho Date: Wed, 13 Oct 2021 20:27:40 +0900 Subject: [PATCH 49/70] Remove redundant assigning `format_id` --- youtube_dl/extractor/uol.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/uol.py b/youtube_dl/extractor/uol.py index 628adf219..59f8e5dc3 100644 --- a/youtube_dl/extractor/uol.py +++ b/youtube_dl/extractor/uol.py @@ -95,7 +95,6 @@ class UOLIE(InfoExtractor): if v: query[k] = v f_url = update_url_query(f_url, query) - format_id = format_id if format_id == 'HLS': m3u8_formats = self._extract_m3u8_formats( f_url, media_id, 'mp4', 'm3u8_native', From 47b0c8697a39bbd64d5b922f81ad74ee4d2a3136 Mon Sep 17 00:00:00 2001 From: dirkf Date: Mon, 7 Feb 2022 13:28:21 +0000 Subject: [PATCH 50/70] [ARD] Back-port subtitle extraction from yt-dlp PR 2409 Authored by: fstirlitz Fixes #30543 Closes #17766 (thanks ngdio) --- youtube_dl/extractor/ard.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index d45a9fe52..a5b1f54d5 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -332,9 +332,24 @@ class ARDIE(InfoExtractor): formats.append(f) self._sort_formats(formats) + _SUB_FORMATS = ( + ('./dataTimedText', 'ttml'), + ('./dataTimedTextNoOffset', 'ttml'), + ('./dataTimedTextVtt', 'vtt'), + ) + + subtitles = {} + for subsel, subext in _SUB_FORMATS: + for node in video_node.findall(subsel): + subtitles.setdefault('de', []).append({ + 'url': node.attrib['url'], + 'ext': subext, + }) + return { 'id': xpath_text(video_node, './videoId', default=display_id), 'formats': formats, + 'subtitles': subtitles, 'display_id': display_id, 'title': video_node.find('./title').text, 'duration': parse_duration(video_node.find('./duration').text), From 825d3426c56aabfc91aea139f2e6e0589f8096bc Mon Sep 17 00:00:00 2001 From: dirkf Date: Wed, 9 Feb 2022 02:40:34 +0000 Subject: [PATCH 51/70] [Nuvid] Use site JSON for video details (#29332) Back-port yt-dlp PR 1022 onto PR #17890 and update Video details aren't in the original HTML now but populated by async JS Co-authored by: u-spec-png Co-authored by: vidaritos --- youtube_dl/extractor/nuvid.py | 120 +++++++++++++++++++++++----------- 1 file changed, 81 insertions(+), 39 deletions(-) diff --git a/youtube_dl/extractor/nuvid.py b/youtube_dl/extractor/nuvid.py index ab6bfcd7f..f6c94dd77 100644 --- a/youtube_dl/extractor/nuvid.py +++ b/youtube_dl/extractor/nuvid.py @@ -1,71 +1,113 @@ +# coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( parse_duration, + int_or_none, + try_get, + url_or_none, ) +import re + class NuvidIE(InfoExtractor): _VALID_URL = r'https?://(?:www|m)\.nuvid\.com/video/(?P[0-9]+)' - _TEST = { - 'url': 'http://m.nuvid.com/video/1310741/', - 'md5': 'eab207b7ac4fccfb4e23c86201f11277', + _TESTS = [{ + 'url': 'https://www.nuvid.com/video/6513023/italian-babe', + 'md5': '772d2f8288f3d3c5c45f7a41761c7844', 'info_dict': { - 'id': '1310741', + 'id': '6513023', 'ext': 'mp4', - 'title': 'Horny babes show their awesome bodeis and', - 'duration': 129, + 'title': 'italian babe', + 'format_id': '360p', + 'duration': 321.0, 'age_limit': 18, + 'thumbnail': r're:https?://.+\.jpg', + 'thumbnails': list, } - } + }, { + 'url': 'https://m.nuvid.com/video/6523263', + 'md5': 'ebd22ce8e47e1d9a4d0756a15c67da52', + 'info_dict': { + 'id': '6523263', + 'ext': 'mp4', + 'title': 'Slut brunette college student anal dorm', + 'format_id': '720p', + 'duration': 421.0, + 'age_limit': 18, + 'thumbnail': r're:https?://.+\.jpg', + 'thumbnails': list, + } + }, { + 'url': 'http://m.nuvid.com/video/6415801/', + 'md5': '638d5ececb138d5753593f751ae3f697', + 'info_dict': { + 'id': '6415801', + 'ext': 'mp4', + 'title': 'My best friend wanted to fuck my wife for a long time', + 'format_id': '720p', + 'duration': 1882, + 'age_limit': 18, + 'thumbnail': r're:https?://.+\.jpg', + 'thumbnails': list, + } + }] def _real_extract(self, url): video_id = self._match_id(url) - page_url = 'http://m.nuvid.com/video/%s' % video_id + qualities = { + 'lq': '360p', + 'hq': '720p', + } + + json_url = 'https://www.nuvid.com/player_config_json/?vid={video_id}&aid=0&domain_id=0&embed=0&check_speed=0'.format(**locals()) + video_data = self._download_json( + json_url, video_id, headers={ + 'Accept': 'application/json, text/javascript, */*; q = 0.01', + 'Content-Type': 'application/x-www-form-urlencoded; charset=utf-8', + }) or {} + + # nice to have, not required webpage = self._download_webpage( - page_url, video_id, 'Downloading video page') - # When dwnld_speed exists and has a value larger than the MP4 file's - # bitrate, Nuvid returns the MP4 URL - # It's unit is 100bytes/millisecond, see mobile-nuvid-min.js for the algorithm - self._set_cookie('nuvid.com', 'dwnld_speed', '10.0') - mp4_webpage = self._download_webpage( - page_url, video_id, 'Downloading video page for MP4 format') + 'http://m.nuvid.com/video/%s' % (video_id, ), + video_id, 'Downloading video page', fatal=False) or '' + + title = ( + try_get(video_data, lambda x: x['title'], compat_str) + or self._html_search_regex( + (r''']*?\btitle\s*=\s*(?P"|'|\b)(?P[^"]+)(?P=q)\s*>''', + r'''<div\s[^>]*?\bclass\s*=\s*(?P<q>"|'|\b)thumb-holder video(?P=q)>\s*<h5\b[^>]*>(?P<title>[^<]+)</h5''', + r'''<span\s[^>]*?\bclass\s*=\s*(?P<q>"|'|\b)title_thumb(?P=q)>(?P<title>[^<]+)</span'''), + webpage, 'title', group='title')).strip() - html5_video_re = r'(?s)<(?:video|audio)[^<]*(?:>.*?<source[^>]*)?\s+src=["\'](.*?)["\']', - video_url = self._html_search_regex(html5_video_re, webpage, video_id) - mp4_video_url = self._html_search_regex(html5_video_re, mp4_webpage, video_id) formats = [{ - 'url': video_url, - }] - if mp4_video_url != video_url: - formats.append({ - 'url': mp4_video_url, - }) + 'url': source, + 'format_id': qualities.get(quality), + 'height': int_or_none(qualities.get(quality)[:-1]), + } for quality, source in video_data.get('files').items() if source] - title = self._html_search_regex( - [r'<span title="([^"]+)">', - r'<div class="thumb-holder video">\s*<h5[^>]*>([^<]+)</h5>', - r'<span[^>]+class="title_thumb">([^<]+)</span>'], webpage, 'title').strip() + self._check_formats(formats, video_id) + self._sort_formats(formats) + + duration = parse_duration(video_data.get('duration') or video_data.get('duration_format')) thumbnails = [ - { - 'url': thumb_url, - } for thumb_url in re.findall(r'<img src="([^"]+)" alt="" />', webpage) + {'url': thumb_url, } + for thumb_url in ( + url_or_none(src) for src in re.findall( + r'<div\s+class\s*=\s*"video-tmb-wrap"\s*>\s*<img\s+src\s*=\s*"([^"]+)"\s*/>', + webpage)) ] - thumbnail = thumbnails[0]['url'] if thumbnails else None - duration = parse_duration(self._html_search_regex( - [r'<i class="fa fa-clock-o"></i>\s*(\d{2}:\d{2})', - r'<span[^>]+class="view_time">([^<]+)</span>'], webpage, 'duration', fatal=False)) return { 'id': video_id, + 'formats': formats, 'title': title, + 'thumbnail': url_or_none(video_data.get('poster')), 'thumbnails': thumbnails, - 'thumbnail': thumbnail, 'duration': duration, 'age_limit': 18, - 'formats': formats, } From 266b6ef18520f8de60fa143e154e4b12be12afb7 Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Wed, 9 Feb 2022 21:21:59 +0000 Subject: [PATCH 52/70] [BBC] Also allow PID with leading 'l' (live?) --- youtube_dl/extractor/bbc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 088af9823..378b52f4f 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -40,7 +40,7 @@ from ..utils import ( class BBCCoUkIE(InfoExtractor): IE_NAME = 'bbc.co.uk' IE_DESC = 'BBC iPlayer' - _ID_REGEX = r'(?:[pbm][\da-z]{7}|w[\da-z]{7,14})' + _ID_REGEX = r'(?:[pbml][\da-z]{7}|w[\da-z]{7,14})' _VALID_URL = r'''(?x) https?:// (?:www\.)?bbc\.co\.uk/ From 8ff961d10faed848009f9e2ec03fa390b486694d Mon Sep 17 00:00:00 2001 From: kikuyan <kikuyan@users.noreply.github.com> Date: Thu, 23 Dec 2021 11:40:45 +0900 Subject: [PATCH 53/70] [extractor/videa] fix extraction in Py2 Fixes #30416 --- youtube_dl/extractor/videa.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/videa.py b/youtube_dl/extractor/videa.py index ab2c15cde..bdb95891d 100644 --- a/youtube_dl/extractor/videa.py +++ b/youtube_dl/extractor/videa.py @@ -91,7 +91,7 @@ class VideaIE(InfoExtractor): k = S[(S[i] + S[j]) % 256] res += compat_struct_pack('B', k ^ compat_ord(cipher_text[m])) - return res.decode() + return res.decode('utf-8') def _real_extract(self, url): video_id = self._match_id(url) @@ -121,7 +121,7 @@ class VideaIE(InfoExtractor): compat_b64decode(b64_info), key), video_id) video = xpath_element(info, './video', 'video') - if not video: + if video is None: raise ExtractorError(xpath_element( info, './error', fatal=True), expected=True) sources = xpath_element( From 74f8cc48afa59e1a125f939c060b21654d29789c Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Wed, 9 Feb 2022 04:37:28 +0000 Subject: [PATCH 54/70] [extractor/videa] Back-port from yt-dlp PRs 463+1028 Authored by: nyuszika7h --- youtube_dl/extractor/videa.py | 53 ++++++++++++++++++++++++++--------- 1 file changed, 39 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/videa.py b/youtube_dl/extractor/videa.py index bdb95891d..4589e78a1 100644 --- a/youtube_dl/extractor/videa.py +++ b/youtube_dl/extractor/videa.py @@ -12,6 +12,7 @@ from ..utils import ( mimetype2ext, parse_codecs, update_url_query, + urljoin, xpath_element, xpath_text, ) @@ -19,6 +20,7 @@ from ..compat import ( compat_b64decode, compat_ord, compat_struct_pack, + compat_urlparse, ) @@ -45,10 +47,24 @@ class VideaIE(InfoExtractor): }, }, { 'url': 'http://videa.hu/videok/origo/jarmuvek/supercars-elozes-jAHDWfWSJH5XuFhH', - 'only_matching': True, + 'md5': 'd57ccd8812c7fd491d33b1eab8c99975', + 'info_dict': { + 'id': 'jAHDWfWSJH5XuFhH', + 'ext': 'mp4', + 'title': 'Supercars előzés', + 'thumbnail': r're:^https?://.*', + 'duration': 64, + }, }, { 'url': 'http://videa.hu/player?v=8YfIAjxwWGwT8HVQ', - 'only_matching': True, + 'md5': '97a7af41faeaffd9f1fc864a7c7e7603', + 'info_dict': { + 'id': '8YfIAjxwWGwT8HVQ', + 'ext': 'mp4', + 'title': 'Az őrült kígyász 285 kígyót enged szabadon', + 'thumbnail': r're:^https?://.*', + 'duration': 21, + }, }, { 'url': 'http://videa.hu/player/v/8YfIAjxwWGwT8HVQ?autoplay=1', 'only_matching': True, @@ -95,9 +111,16 @@ class VideaIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - query = {'v': video_id} - player_page = self._download_webpage( - 'https://videa.hu/player', video_id, query=query) + video_page = self._download_webpage(url, video_id) + + if 'videa.hu/player' in url: + player_url = url + player_page = video_page + else: + player_url = self._search_regex( + r'<iframe.*?src="(/player\?[^"]+)"', video_page, 'player url') + player_url = urljoin(url, player_url) + player_page = self._download_webpage(player_url, video_id) nonce = self._search_regex( r'_xt\s*=\s*"([^"]+)"', player_page, 'nonce') @@ -107,6 +130,7 @@ class VideaIE(InfoExtractor): for i in range(0, 32): result += s[i - (self._STATIC_SECRET.index(l[i]) - 31)] + query = compat_urlparse.parse_qs(compat_urlparse.urlparse(player_url).query) random_seed = ''.join(random.choice(string.ascii_letters + string.digits) for _ in range(8)) query['_s'] = random_seed query['_t'] = result[:16] @@ -127,7 +151,7 @@ class VideaIE(InfoExtractor): sources = xpath_element( info, './video_sources', 'sources', fatal=True) hash_values = xpath_element( - info, './hash_values', 'hash values', fatal=True) + info, './hash_values', 'hash values', fatal=False) title = xpath_text(video, './title', fatal=True) @@ -136,15 +160,16 @@ class VideaIE(InfoExtractor): source_url = source.text source_name = source.get('name') source_exp = source.get('exp') - if not (source_url and source_name and source_exp): + if not (source_url and source_name): continue - hash_value = xpath_text(hash_values, 'hash_value_' + source_name) - if not hash_value: - continue - source_url = update_url_query(source_url, { - 'md5': hash_value, - 'expires': source_exp, - }) + hash_value = ( + xpath_text(hash_values, 'hash_value_' + source_name) + if hash_values is not None else None) + if hash_value and source_exp: + source_url = update_url_query(source_url, { + 'md5': hash_value, + 'expires': source_exp, + }) f = parse_codecs(source.get('codecs')) f.update({ 'url': self._proto_relative_url(source_url), From 29f7bfc4d7a80cecd67c19c25134481fbba6e175 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Petr=20Van=C4=9Bk?= <arkamar@atlas.cz> Date: Tue, 11 Jan 2022 17:56:18 +0100 Subject: [PATCH 55/70] [streamcz] cherry-pick from yt-dlp Cherry-picked-from: 7d449fff5346 ("[streamcz] Fix extractor (#1616)") --- youtube_dl/extractor/streamcz.py | 157 ++++++++++++++++--------------- 1 file changed, 80 insertions(+), 77 deletions(-) diff --git a/youtube_dl/extractor/streamcz.py b/youtube_dl/extractor/streamcz.py index 58e0b4c80..0191c77de 100644 --- a/youtube_dl/extractor/streamcz.py +++ b/youtube_dl/extractor/streamcz.py @@ -1,105 +1,108 @@ # coding: utf-8 -from __future__ import unicode_literals - -import hashlib -import time +import json from .common import InfoExtractor from ..utils import ( + float_or_none, int_or_none, - sanitized_Request, + parse_codecs, + traverse_obj, + urljoin, ) -def _get_api_key(api_path): - if api_path.endswith('?'): - api_path = api_path[:-1] - - api_key = 'fb5f58a820353bd7095de526253c14fd' - a = '{0:}{1:}{2:}'.format(api_key, api_path, int(round(time.time() / 24 / 3600))) - return hashlib.md5(a.encode('ascii')).hexdigest() - - class StreamCZIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?stream\.cz/.+/(?P<id>[0-9]+)' - _API_URL = 'http://www.stream.cz/API' - + _VALID_URL = r'https?://(?:www\.)?(?:stream|televizeseznam)\.cz/[^?#]+/(?P<display_id>[^?#]+)-(?P<id>[0-9]+)' _TESTS = [{ - 'url': 'http://www.stream.cz/peklonataliri/765767-ecka-pro-deti', - 'md5': '934bb6a6d220d99c010783c9719960d5', + 'url': 'https://www.televizeseznam.cz/video/lajna/buh-57953890', + 'md5': '40c41ade1464a390a0b447e333df4239', 'info_dict': { - 'id': '765767', + 'id': '57953890', 'ext': 'mp4', - 'title': 'Peklo na talíři: Éčka pro děti', - 'description': 'Taška s grónskou pomazánkou a další pekelnosti ZDE', - 'thumbnail': 're:^http://im.stream.cz/episode/52961d7e19d423f8f06f0100', - 'duration': 256, - }, + 'title': 'Bůh', + 'display_id': 'buh', + 'description': 'md5:8f5f09b9b7bc67df910486cdd88f7165', + } }, { - 'url': 'http://www.stream.cz/blanik/10002447-tri-roky-pro-mazanka', - 'md5': '849a88c1e1ca47d41403c2ba5e59e261', + 'url': 'https://www.stream.cz/tajemno/znicehonic-jim-skrz-strechu-prolitnul-zahadny-predmet-badatele-vse-objasnili-64147267', + 'md5': '3ee4d0be040e8f4a543e67e509d55e3f', 'info_dict': { - 'id': '10002447', + 'id': '64147267', 'ext': 'mp4', - 'title': 'Kancelář Blaník: Tři roky pro Mazánka', - 'description': 'md5:3862a00ba7bf0b3e44806b544032c859', - 'thumbnail': 're:^http://im.stream.cz/episode/537f838c50c11f8d21320000', - 'duration': 368, - }, + 'title': 'Zničehonic jim skrz střechu prolítnul záhadný předmět. Badatelé vše objasnili', + 'display_id': 'znicehonic-jim-skrz-strechu-prolitnul-zahadny-predmet-badatele-vse-objasnili', + 'description': 'md5:1dcb5e010eb697dedc5942f76c5b3744', + } }] + def _extract_formats(self, spl_url, video): + for ext, pref, streams in ( + ('ts', -1, traverse_obj(video, ('http_stream', 'qualities'))), + ('mp4', 1, video.get('mp4'))): + for format_id, stream in streams.items(): + if not stream.get('url'): + continue + yield { + 'format_id': f'{format_id}-{ext}', + 'ext': ext, + 'source_preference': pref, + 'url': urljoin(spl_url, stream['url']), + 'tbr': float_or_none(stream.get('bandwidth'), scale=1000), + 'duration': float_or_none(stream.get('duration'), scale=1000), + 'width': traverse_obj(stream, ('resolution', 0)), + 'height': traverse_obj(stream, ('resolution', 1)) or int_or_none(format_id.replace('p', '')), + **parse_codecs(stream.get('codec')), + } + def _real_extract(self, url): - video_id = self._match_id(url) - api_path = '/episode/%s' % video_id + display_id, video_id = self._match_valid_url(url).groups() - req = sanitized_Request(self._API_URL + api_path) - req.add_header('Api-Password', _get_api_key(api_path)) - data = self._download_json(req, video_id) + data = self._download_json( + 'https://www.televizeseznam.cz/api/graphql', video_id, 'Downloading GraphQL result', + data=json.dumps({ + 'variables': {'urlName': video_id}, + 'query': ''' + query LoadEpisode($urlName : String){ episode(urlName: $urlName){ ...VideoDetailFragmentOnEpisode } } + fragment VideoDetailFragmentOnEpisode on Episode { + id + spl + urlName + name + perex + duration + views + }''' + }).encode('utf-8'), + headers={'Content-Type': 'application/json;charset=UTF-8'} + )['data']['episode'] - formats = [] - for quality, video in enumerate(data['video_qualities']): - for f in video['formats']: - typ = f['type'].partition('/')[2] - qlabel = video.get('quality_label') - formats.append({ - 'format_note': '%s-%s' % (qlabel, typ) if qlabel else typ, - 'format_id': '%s-%s' % (typ, f['quality']), - 'url': f['source'], - 'height': int_or_none(f['quality'].rstrip('p')), - 'quality': quality, - }) - self._sort_formats(formats) - - image = data.get('image') - if image: - thumbnail = self._proto_relative_url( - image.replace('{width}', '1240').replace('{height}', '697'), - scheme='http:', - ) - else: - thumbnail = None - - stream = data.get('_embedded', {}).get('stream:show', {}).get('name') - if stream: - title = '%s: %s' % (stream, data['name']) - else: - title = data['name'] + spl_url = data['spl'] + 'spl2,3' + metadata = self._download_json(spl_url, video_id, 'Downloading playlist') + if 'Location' in metadata and 'data' not in metadata: + spl_url = metadata['Location'] + metadata = self._download_json(spl_url, video_id, 'Downloading redirected playlist') + video = metadata['data'] subtitles = {} - srt_url = data.get('subtitles_srt') - if srt_url: - subtitles['cs'] = [{ - 'ext': 'srt', - 'url': srt_url, - }] + for subs in video.get('subtitles', {}).values(): + if not subs.get('language'): + continue + for ext, sub_url in subs.get('urls').items(): + subtitles.setdefault(subs['language'], []).append({ + 'ext': ext, + 'url': urljoin(spl_url, sub_url) + }) + + formats = list(self._extract_formats(spl_url, video)) + self._sort_formats(formats) return { 'id': video_id, - 'title': title, - 'thumbnail': thumbnail, - 'formats': formats, - 'description': data.get('web_site_text'), - 'duration': int_or_none(data.get('duration')), + 'display_id': display_id, + 'title': data.get('name'), + 'description': data.get('perex'), + 'duration': float_or_none(data.get('duration')), 'view_count': int_or_none(data.get('views')), + 'formats': formats, 'subtitles': subtitles, } From 8088ce036ac4ce282f8f864c6b5f4f3987647221 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Petr=20Van=C4=9Bk?= <arkamar@atlas.cz> Date: Sat, 12 Feb 2022 11:55:13 +0100 Subject: [PATCH 56/70] revert: use _match_valid_url function --- youtube_dl/extractor/streamcz.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/streamcz.py b/youtube_dl/extractor/streamcz.py index 0191c77de..998342e93 100644 --- a/youtube_dl/extractor/streamcz.py +++ b/youtube_dl/extractor/streamcz.py @@ -1,5 +1,6 @@ # coding: utf-8 import json +import re from .common import InfoExtractor from ..utils import ( @@ -55,7 +56,7 @@ class StreamCZIE(InfoExtractor): } def _real_extract(self, url): - display_id, video_id = self._match_valid_url(url).groups() + display_id, video_id = re.match(self._VALID_URL, url).groups() data = self._download_json( 'https://www.televizeseznam.cz/api/graphql', video_id, 'Downloading GraphQL result', From b1297308fb7b423a60c3a28c74ac014d7b385a2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Petr=20Van=C4=9Bk?= <arkamar@atlas.cz> Date: Sat, 12 Feb 2022 12:28:30 +0100 Subject: [PATCH 57/70] avoid traverse_obj function --- youtube_dl/extractor/streamcz.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/streamcz.py b/youtube_dl/extractor/streamcz.py index 998342e93..fbdc44505 100644 --- a/youtube_dl/extractor/streamcz.py +++ b/youtube_dl/extractor/streamcz.py @@ -7,7 +7,6 @@ from ..utils import ( float_or_none, int_or_none, parse_codecs, - traverse_obj, urljoin, ) @@ -38,7 +37,7 @@ class StreamCZIE(InfoExtractor): def _extract_formats(self, spl_url, video): for ext, pref, streams in ( - ('ts', -1, traverse_obj(video, ('http_stream', 'qualities'))), + ('ts', -1, video.get('http_stream', {}).get('qualities', {})), ('mp4', 1, video.get('mp4'))): for format_id, stream in streams.items(): if not stream.get('url'): @@ -50,8 +49,8 @@ class StreamCZIE(InfoExtractor): 'url': urljoin(spl_url, stream['url']), 'tbr': float_or_none(stream.get('bandwidth'), scale=1000), 'duration': float_or_none(stream.get('duration'), scale=1000), - 'width': traverse_obj(stream, ('resolution', 0)), - 'height': traverse_obj(stream, ('resolution', 1)) or int_or_none(format_id.replace('p', '')), + 'width': stream.get('resolution', 2 * [0])[0] or None, + 'height': stream.get('resolution', 2 * [0])[1] or int_or_none(format_id.replace('p', '')), **parse_codecs(stream.get('codec')), } From d02064218be76eba6350a13ccbbc473b1b439570 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Petr=20Van=C4=9Bk?= <arkamar@atlas.cz> Date: Sat, 12 Feb 2022 12:30:29 +0100 Subject: [PATCH 58/70] do not use f-strings --- youtube_dl/extractor/streamcz.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/streamcz.py b/youtube_dl/extractor/streamcz.py index fbdc44505..d1736c023 100644 --- a/youtube_dl/extractor/streamcz.py +++ b/youtube_dl/extractor/streamcz.py @@ -43,7 +43,7 @@ class StreamCZIE(InfoExtractor): if not stream.get('url'): continue yield { - 'format_id': f'{format_id}-{ext}', + 'format_id': '{}-{}'.format(format_id, ext), 'ext': ext, 'source_preference': pref, 'url': urljoin(spl_url, stream['url']), From d8adca1b664fceb07f2b28b55c7e1855407296ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Petr=20Van=C4=9Bk?= <arkamar@atlas.cz> Date: Sat, 12 Feb 2022 13:13:20 +0100 Subject: [PATCH 59/70] [streamcz] test fixes and one additional test --- youtube_dl/extractor/streamcz.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/streamcz.py b/youtube_dl/extractor/streamcz.py index d1736c023..60e770448 100644 --- a/youtube_dl/extractor/streamcz.py +++ b/youtube_dl/extractor/streamcz.py @@ -22,6 +22,20 @@ class StreamCZIE(InfoExtractor): 'title': 'Bůh', 'display_id': 'buh', 'description': 'md5:8f5f09b9b7bc67df910486cdd88f7165', + 'duration': 1369.6, + 'view_count': int, + } + }, { + 'url': 'https://www.stream.cz/kdo-to-mluvi/kdo-to-mluvi-velke-odhaleni-prinasi-novy-porad-uz-od-25-srpna-64087937', + 'md5': '41fd358000086a1ccdb068c77809b158', + 'info_dict': { + 'id': '64087937', + 'ext': 'mp4', + 'title': 'Kdo to mluví? Velké odhalení přináší nový pořad už od 25. srpna', + 'display_id': 'kdo-to-mluvi-velke-odhaleni-prinasi-novy-porad-uz-od-25-srpna', + 'description': 'md5:97a811000a6460266029d6c1c2ebcd59', + 'duration': 50.2, + 'view_count': int, } }, { 'url': 'https://www.stream.cz/tajemno/znicehonic-jim-skrz-strechu-prolitnul-zahadny-predmet-badatele-vse-objasnili-64147267', @@ -31,7 +45,9 @@ class StreamCZIE(InfoExtractor): 'ext': 'mp4', 'title': 'Zničehonic jim skrz střechu prolítnul záhadný předmět. Badatelé vše objasnili', 'display_id': 'znicehonic-jim-skrz-strechu-prolitnul-zahadny-predmet-badatele-vse-objasnili', - 'description': 'md5:1dcb5e010eb697dedc5942f76c5b3744', + 'description': 'md5:4b8ada6718d34bb011c4e04ca4bc19bf', + 'duration': 442.84, + 'view_count': int, } }] From 85bf26c1d01f94b83476703e5c70022f01164ccf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Petr=20Van=C4=9Bk?= <arkamar@atlas.cz> Date: Sat, 12 Feb 2022 15:02:08 +0100 Subject: [PATCH 60/70] resolve problem with unpacking operator for <py3.5 --- youtube_dl/extractor/streamcz.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/streamcz.py b/youtube_dl/extractor/streamcz.py index 60e770448..179bdcaba 100644 --- a/youtube_dl/extractor/streamcz.py +++ b/youtube_dl/extractor/streamcz.py @@ -6,6 +6,7 @@ from .common import InfoExtractor from ..utils import ( float_or_none, int_or_none, + merge_dicts, parse_codecs, urljoin, ) @@ -58,7 +59,7 @@ class StreamCZIE(InfoExtractor): for format_id, stream in streams.items(): if not stream.get('url'): continue - yield { + yield merge_dicts({ 'format_id': '{}-{}'.format(format_id, ext), 'ext': ext, 'source_preference': pref, @@ -67,8 +68,7 @@ class StreamCZIE(InfoExtractor): 'duration': float_or_none(stream.get('duration'), scale=1000), 'width': stream.get('resolution', 2 * [0])[0] or None, 'height': stream.get('resolution', 2 * [0])[1] or int_or_none(format_id.replace('p', '')), - **parse_codecs(stream.get('codec')), - } + }, parse_codecs(stream.get('codec'))) def _real_extract(self, url): display_id, video_id = re.match(self._VALID_URL, url).groups() From bf23bc0489cf304b2a8ab756f2f63b2cfa5586fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Petr=20Van=C4=9Bk?= <arkamar@atlas.cz> Date: Sat, 12 Feb 2022 15:27:10 +0100 Subject: [PATCH 61/70] add missing __future__ import unicode_literals --- youtube_dl/extractor/streamcz.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/streamcz.py b/youtube_dl/extractor/streamcz.py index 179bdcaba..060ba32e0 100644 --- a/youtube_dl/extractor/streamcz.py +++ b/youtube_dl/extractor/streamcz.py @@ -1,4 +1,6 @@ # coding: utf-8 +from __future__ import unicode_literals + import json import re From 34722270741fb9c06f978861c1e5f503291070d8 Mon Sep 17 00:00:00 2001 From: Vladimir Stavrinov <9163352+vstavrinov@users.noreply.github.com> Date: Mon, 14 Feb 2022 20:54:31 +0300 Subject: [PATCH 62/70] [rutv] fix vbr for empty string value (#30623) * [rutv] use str_to_int() (thx dirkf) --- youtube_dl/extractor/rutv.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/rutv.py b/youtube_dl/extractor/rutv.py index d2713c19a..05f319396 100644 --- a/youtube_dl/extractor/rutv.py +++ b/youtube_dl/extractor/rutv.py @@ -6,7 +6,8 @@ import re from .common import InfoExtractor from ..utils import ( ExtractorError, - int_or_none + int_or_none, + str_to_int ) @@ -179,7 +180,7 @@ class RUTVIE(InfoExtractor): 'player_url': 'http://player.rutv.ru/flash3v/osmf.swf?i=22', 'rtmp_live': True, 'ext': 'flv', - 'vbr': int(quality), + 'vbr': str_to_int(quality), 'preference': preference, } elif transport == 'm3u8': From 782bfd26dbebea60e35f58ab18e218bedbecb782 Mon Sep 17 00:00:00 2001 From: "Lesmiscore (Naoya Ozaki)" <nao20010128@gmail.com> Date: Thu, 24 Feb 2022 22:34:32 +0900 Subject: [PATCH 63/70] [bigo] add support for bigo.tv (#30635) * [bigo] add support for bigo.tv * [bigo] prepend "Bigo says" * title fallback * add error for invalid json data --- youtube_dl/extractor/bigo.py | 59 ++++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + 2 files changed, 60 insertions(+) create mode 100644 youtube_dl/extractor/bigo.py diff --git a/youtube_dl/extractor/bigo.py b/youtube_dl/extractor/bigo.py new file mode 100644 index 000000000..ddf76ac55 --- /dev/null +++ b/youtube_dl/extractor/bigo.py @@ -0,0 +1,59 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ExtractorError, urlencode_postdata + + +class BigoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?bigo\.tv/(?:[a-z]{2,}/)?(?P<id>[^/]+)' + + _TESTS = [{ + 'url': 'https://www.bigo.tv/ja/221338632', + 'info_dict': { + 'id': '6576287577575737440', + 'title': '土よ〜💁‍♂️ 休憩室/REST room', + 'thumbnail': r're:https?://.+', + 'uploader': '✨Shin💫', + 'uploader_id': '221338632', + 'is_live': True, + }, + 'skip': 'livestream', + }, { + 'url': 'https://www.bigo.tv/th/Tarlerm1304', + 'only_matching': True, + }, { + 'url': 'https://bigo.tv/115976881', + 'only_matching': True, + }] + + def _real_extract(self, url): + user_id = self._match_id(url) + + info_raw = self._download_json( + 'https://bigo.tv/studio/getInternalStudioInfo', + user_id, data=urlencode_postdata({'siteId': user_id})) + + if not isinstance(info_raw, dict): + raise ExtractorError('Received invalid JSON data') + if info_raw.get('code'): + raise ExtractorError( + 'Bigo says: %s (code %s)' % (info_raw.get('msg'), info_raw.get('code')), expected=True) + info = info_raw.get('data') or {} + + if not info.get('alive'): + raise ExtractorError('This user is offline.', expected=True) + + return { + 'id': info.get('roomId') or user_id, + 'title': info.get('roomTopic') or info.get('nick_name') or user_id, + 'formats': [{ + 'url': info.get('hls_src'), + 'ext': 'mp4', + 'protocol': 'm3u8', + }], + 'thumbnail': info.get('snapshot'), + 'uploader': info.get('nick_name'), + 'uploader_id': user_id, + 'is_live': True, + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 50b7cb4a0..c73c4cd6c 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -115,6 +115,7 @@ from .bfmtv import ( ) from .bibeltv import BibelTVIE from .bigflix import BigflixIE +from .bigo import BigoIE from .bild import BildIE from .bilibili import ( BiliBiliIE, From 923292ba643bf2a5c1fade797bd87a0de4f58d25 Mon Sep 17 00:00:00 2001 From: marieell <marieell@tuta.io> Date: Thu, 10 Feb 2022 10:36:24 +0100 Subject: [PATCH 64/70] [aliexpress] Fix test case --- youtube_dl/extractor/aliexpress.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/aliexpress.py b/youtube_dl/extractor/aliexpress.py index 6f241e683..9722fe9ac 100644 --- a/youtube_dl/extractor/aliexpress.py +++ b/youtube_dl/extractor/aliexpress.py @@ -18,7 +18,7 @@ class AliExpressLiveIE(InfoExtractor): 'id': '2800002704436634', 'ext': 'mp4', 'title': 'CASIMA7.22', - 'thumbnail': r're:http://.*\.jpg', + 'thumbnail': r're:https?://.*\.jpg', 'uploader': 'CASIMA Official Store', 'timestamp': 1500717600, 'upload_date': '20170722', From 1f13ccfd7fcafbfd79ddd652967e02f9eda7ce79 Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Thu, 24 Feb 2022 18:26:58 +0000 Subject: [PATCH 65/70] Fixed groups() call on potentially empty regex search object (#30676) * Fixed groups() call on potentially empty regex search object. - https://github.com/ytdl-org/youtube-dl/issues/30521 * minimising lines changed Co-authored-by: yayorbitgum <50963144+yayorbitgum@users.noreply.github.com> --- youtube_dl/extractor/myspass.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/myspass.py b/youtube_dl/extractor/myspass.py index db7ebc94c..f540c52ee 100644 --- a/youtube_dl/extractor/myspass.py +++ b/youtube_dl/extractor/myspass.py @@ -35,7 +35,9 @@ class MySpassIE(InfoExtractor): title = xpath_text(metadata, 'title', fatal=True) video_url = xpath_text(metadata, 'url_flv', 'download url', True) video_id_int = int(video_id) - for group in re.search(r'/myspass2009/\d+/(\d+)/(\d+)/(\d+)/', video_url).groups(): + + grps = re.search(r'/myspass2009/\d+/(\d+)/(\d+)/(\d+)/', video_url) + for group in grps.groups() if grps else []: group_int = int(group) if group_int > video_id_int: video_url = video_url.replace( From c4d1738316db45e03e0625650b3550334b66ab7f Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Thu, 24 Feb 2022 09:16:16 +0000 Subject: [PATCH 66/70] [CPAC] Add extractor for Canadian Parliament CPACIE: single episode CPACPlaylistIE: playlists and searches --- youtube_dl/extractor/cpac.py | 148 +++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 4 + 2 files changed, 152 insertions(+) create mode 100644 youtube_dl/extractor/cpac.py diff --git a/youtube_dl/extractor/cpac.py b/youtube_dl/extractor/cpac.py new file mode 100644 index 000000000..22741152c --- /dev/null +++ b/youtube_dl/extractor/cpac.py @@ -0,0 +1,148 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + int_or_none, + str_or_none, + try_get, + unified_timestamp, + update_url_query, + urljoin, +) + +# compat_range +try: + if callable(xrange): + range = xrange +except (NameError, TypeError): + pass + + +class CPACIE(InfoExtractor): + IE_NAME = 'cpac' + _VALID_URL = r'https?://(?:www\.)?cpac\.ca/(?P<fr>l-)?episode\?id=(?P<id>[\da-f]{8}(?:-[\da-f]{4}){3}-[\da-f]{12})' + _TEST = { + # 'url': 'http://www.cpac.ca/en/programs/primetime-politics/episodes/65490909', + 'url': 'https://www.cpac.ca/episode?id=fc7edcae-4660-47e1-ba61-5b7f29a9db0f', + 'md5': 'e46ad699caafd7aa6024279f2614e8fa', + 'info_dict': { + 'id': 'fc7edcae-4660-47e1-ba61-5b7f29a9db0f', + 'ext': 'mp4', + 'upload_date': '20220215', + 'title': 'News Conference to Celebrate National Kindness Week – February 15, 2022', + 'description': 'md5:466a206abd21f3a6f776cdef290c23fb', + 'timestamp': 1644901200, + }, + 'params': { + 'format': 'bestvideo', + 'hls_prefer_native': True, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + url_lang = 'fr' if '/l-episode?' in url else 'en' + + content = self._download_json( + 'https://www.cpac.ca/api/1/services/contentModel.json?url=/site/website/episode/index.xml&crafterSite=cpacca&id=' + video_id, + video_id) + video_url = try_get(content, lambda x: x['page']['details']['videoUrl'], compat_str) + formats = [] + if video_url: + content = content['page'] + title = str_or_none(content['details']['title_%s_t' % (url_lang, )]) + formats = self._extract_m3u8_formats(video_url, video_id, m3u8_id='hls', ext='mp4') + for fmt in formats: + # prefer language to match URL + fmt_lang = fmt.get('language') + if fmt_lang == url_lang: + fmt['language_preference'] = 10 + elif not fmt_lang: + fmt['language_preference'] = -1 + else: + fmt['language_preference'] = -10 + + self._sort_formats(formats) + + category = str_or_none(content['details']['category_%s_t' % (url_lang, )]) + + def is_live(v_type): + return (v_type == 'live') if v_type is not None else None + + return { + 'id': video_id, + 'formats': formats, + 'title': title, + 'description': str_or_none(content['details'].get('description_%s_t' % (url_lang, ))), + 'timestamp': unified_timestamp(content['details'].get('liveDateTime')), + 'category': [category] if category else None, + 'thumbnail': urljoin(url, str_or_none(content['details'].get('image_%s_s' % (url_lang, )))), + 'is_live': is_live(content['details'].get('type')), + } + + +class CPACPlaylistIE(InfoExtractor): + IE_NAME = 'cpac:playlist' + _VALID_URL = r'(?i)https?://(?:www\.)?cpac\.ca/(?:program|search|(?P<fr>emission|rechercher))\?(?:[^&]+&)*?(?P<id>(?:id=\d+|programId=\d+|key=[^&]+))' + + _TESTS = [{ + 'url': 'https://www.cpac.ca/program?id=6', + 'info_dict': { + 'id': 'id=6', + 'title': 'Headline Politics', + 'description': 'Watch CPAC’s signature long-form coverage of the day’s pressing political events as they unfold.', + }, + 'playlist_count': 10, + }, { + 'url': 'https://www.cpac.ca/search?key=hudson&type=all&order=desc', + 'info_dict': { + 'id': 'key=hudson', + 'title': 'hudson', + }, + 'playlist_count': 22, + }, { + 'url': 'https://www.cpac.ca/search?programId=50', + 'info_dict': { + 'id': 'programId=50', + 'title': '50', + }, + 'playlist_count': 9, + }, { + 'url': 'https://www.cpac.ca/emission?id=6', + 'only_matching': True, + }, { + 'url': 'https://www.cpac.ca/rechercher?key=hudson&type=all&order=desc', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + url_lang = 'fr' if any(x in url for x in ('/emission?', '/rechercher?')) else 'en' + pl_type, list_type = ('program', 'itemList') if any(x in url for x in ('/program?', '/emission?')) else ('search', 'searchResult') + api_url = ( + 'https://www.cpac.ca/api/1/services/contentModel.json?url=/site/website/%s/index.xml&crafterSite=cpacca&%s' + % (pl_type, video_id, )) + content = self._download_json(api_url, video_id) + entries = [] + total_pages = int_or_none(try_get(content, lambda x: x['page'][list_type]['totalPages']), default=1) + for page in range(1, total_pages + 1): + if page > 1: + api_url = update_url_query(api_url, {'page': '%d' % (page, ), }) + content = self._download_json( + api_url, video_id, + note='Downloading continuation - %d' % (page, ), + fatal=False) + + for item in try_get(content, lambda x: x['page'][list_type]['item'], list) or []: + episode_url = urljoin(url, try_get(item, lambda x: x['url_%s_s' % (url_lang, )])) + if episode_url: + entries.append(episode_url) + + return self.playlist_result( + (self.url_result(entry) for entry in entries), + playlist_id=video_id, + playlist_title=try_get(content, lambda x: x['page']['program']['title_%s_t' % (url_lang, )]) or video_id.split('=')[-1], + playlist_description=try_get(content, lambda x: x['page']['program']['description_%s_t' % (url_lang, )]), + ) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index c73c4cd6c..7c99cb7e0 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -255,6 +255,10 @@ from .commonprotocols import ( from .condenast import CondeNastIE from .contv import CONtvIE from .corus import CorusIE +from .cpac import ( + CPACIE, + CPACPlaylistIE, +) from .cracked import CrackedIE from .crackle import CrackleIE from .crooksandliars import CrooksAndLiarsIE From f8e543c9063c1c7ad157936cb6a15b428ddb3896 Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Mon, 7 Feb 2022 20:06:27 +0000 Subject: [PATCH 67/70] [Alsace20TV] Add new extractors Alsace20TVIE, Alsace20TVEmbedIE --- youtube_dl/extractor/alsace20tv.py | 89 ++++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 4 ++ 2 files changed, 93 insertions(+) create mode 100644 youtube_dl/extractor/alsace20tv.py diff --git a/youtube_dl/extractor/alsace20tv.py b/youtube_dl/extractor/alsace20tv.py new file mode 100644 index 000000000..228cec3ec --- /dev/null +++ b/youtube_dl/extractor/alsace20tv.py @@ -0,0 +1,89 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + clean_html, + dict_get, + get_element_by_class, + int_or_none, + unified_strdate, + url_or_none, +) + + +class Alsace20TVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?alsace20\.tv/(?:[\w-]+/)+[\w-]+-(?P<id>[\w]+)' + _TESTS = [{ + 'url': 'https://www.alsace20.tv/VOD/Actu/JT/Votre-JT-jeudi-3-fevrier-lyNHCXpYJh.html', + # 'md5': 'd91851bf9af73c0ad9b2cdf76c127fbb', + 'info_dict': { + 'id': 'lyNHCXpYJh', + 'ext': 'mp4', + 'description': 'md5:fc0bc4a0692d3d2dba4524053de4c7b7', + 'title': 'Votre JT du jeudi 3 février', + 'upload_date': '20220203', + 'thumbnail': r're:https?://.+\.jpg', + 'duration': 1073, + 'view_count': int, + }, + 'params': { + 'format': 'bestvideo', + }, + }] + + def _extract_video(self, video_id, url=None): + info = self._download_json( + 'https://www.alsace20.tv/visionneuse/visio_v9_js.php?key=%s&habillage=0&mode=html' % (video_id, ), + video_id) or {} + title = info['titre'] + + formats = [] + for res, fmt_url in (info.get('files') or {}).items(): + formats.extend( + self._extract_smil_formats(fmt_url, video_id, fatal=False) + if '/smil:_' in fmt_url + else self._extract_mpd_formats(fmt_url, video_id, mpd_id=res, fatal=False)) + self._sort_formats(formats) + + webpage = (url and self._download_webpage(url, video_id, fatal=False)) or '' + thumbnail = url_or_none(dict_get(info, ('image', 'preview', )) or self._og_search_thumbnail(webpage)) + upload_date = self._search_regex(r'/(\d{6})_', thumbnail, 'upload_date', default=None) + upload_date = unified_strdate('20%s-%s-%s' % (upload_date[:2], upload_date[2:4], upload_date[4:])) if upload_date else None + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'description': clean_html(get_element_by_class('wysiwyg', webpage)), + 'upload_date': upload_date, + 'thumbnail': thumbnail, + 'duration': int_or_none(self._og_search_property('video:duration', webpage) if webpage else None), + 'view_count': int_or_none(info.get('nb_vues')), + } + + def _real_extract(self, url): + video_id = self._match_id(url) + return self._extract_video(video_id, url) + + +class Alsace20TVEmbedIE(Alsace20TVIE): + _VALID_URL = r'https?://(?:www\.)?alsace20\.tv/emb/(?P<id>[\w]+)' + _TESTS = [{ + 'url': 'https://www.alsace20.tv/emb/lyNHCXpYJh', + # 'md5': 'd91851bf9af73c0ad9b2cdf76c127fbb', + 'info_dict': { + 'id': 'lyNHCXpYJh', + 'ext': 'mp4', + 'title': 'Votre JT du jeudi 3 février', + 'upload_date': '20220203', + 'thumbnail': r're:https?://.+\.jpg', + 'view_count': int, + }, + 'params': { + 'format': 'bestvideo', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + return self._extract_video(video_id) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 7c99cb7e0..535080d0a 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -51,6 +51,10 @@ from .anvato import AnvatoIE from .aol import AolIE from .allocine import AllocineIE from .aliexpress import AliExpressLiveIE +from .alsace20tv import ( + Alsace20TVIE, + Alsace20TVEmbedIE, +) from .apa import APAIE from .aparat import AparatIE from .appleconnect import AppleConnectIE From 4194d253c0b922addf0439228066cb4fb487bac3 Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Fri, 30 Jul 2021 12:58:19 +0100 Subject: [PATCH 68/70] Avoid skipping ID when unlisted_hash is numeric Pattern needed a non-greedy match; also replaced a redundant test with one for this, issue 29690 --- youtube_dl/extractor/vimeo.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 0b386f450..a66912502 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -271,7 +271,7 @@ class VimeoIE(VimeoBaseInfoExtractor): )? vimeo(?:pro)?\.com/ (?!(?:channels|album|showcase)/[^/?#]+/?(?:$|[?#])|[^/]+/review/|ondemand/) - (?:.*?/)? + (?:.*?/)?? (?: (?: play_redirect_hls| @@ -517,14 +517,28 @@ class VimeoIE(VimeoBaseInfoExtractor): 'url': 'https://vimeo.com/7809605', 'only_matching': True, }, - { - 'url': 'https://vimeo.com/160743502/abd0e13fb4', - 'only_matching': True, - }, { # requires passing unlisted_hash(a52724358e) to load_download_config request 'url': 'https://vimeo.com/392479337/a52724358e', 'only_matching': True, + }, + { + # similar, but all numeric: ID must be 581039021, not 9603038895 + # issue #29690 + 'url': 'https://vimeo.com/581039021/9603038895', + 'info_dict': { + 'id': '581039021', + # these have to be provided but we don't care + 'ext': 'mp4', + 'timestamp': 1627621014, + 'title': 're:.+', + 'uploader_id': 're:.+', + 'uploader': 're:.+', + 'upload_date': r're:\d+', + }, + 'params': { + 'skip_download': True, + }, } # https://gettingthingsdone.com/workflowmap/ # vimeo embed with check-password page protected by Referer header From 6508688e88c83bb811653083db9351702cd39a6a Mon Sep 17 00:00:00 2001 From: df <fieldhouse@gmx.net> Date: Sun, 1 Aug 2021 09:42:57 +0100 Subject: [PATCH 69/70] Make default upload_/release_date a compat_str Ensures download tests pass in Python 2 as well as 3; also add YoutubeDL tests for timestamp -> upload_date etc. --- test/test_YoutubeDL.py | 19 +++++++++++++++++++ youtube_dl/YoutubeDL.py | 2 +- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index a35effe0e..f8c8e619c 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -997,6 +997,25 @@ class TestYoutubeDL(unittest.TestCase): self.assertEqual(downloaded['extractor'], 'Video') self.assertEqual(downloaded['extractor_key'], 'Video') + def test_default_times(self): + """Test addition of missing upload/release/_date from /release_/timestamp""" + info = { + 'id': '1234', + 'url': TEST_URL, + 'title': 'Title', + 'ext': 'mp4', + 'timestamp': 1631352900, + 'release_timestamp': 1632995931, + } + + params = {'simulate': True, } + ydl = FakeYDL(params) + out_info = ydl.process_ie_result(info) + self.assertTrue(isinstance(out_info['upload_date'], compat_str)) + self.assertEqual(out_info['upload_date'], '20210911') + self.assertTrue(isinstance(out_info['release_date'], compat_str)) + self.assertEqual(out_info['release_date'], '20210930') + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index fe30758ef..69736acff 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1529,7 +1529,7 @@ class YoutubeDL(object): # see http://bugs.python.org/issue1646728) try: upload_date = datetime.datetime.utcfromtimestamp(info_dict[ts_key]) - info_dict[date_key] = upload_date.strftime('%Y%m%d') + info_dict[date_key] = compat_str(upload_date.strftime('%Y%m%d')) except (ValueError, OverflowError, OSError): pass From 49c5293014bc11ec8c009856cd63cffa6296c1e1 Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Tue, 22 Feb 2022 11:24:06 +0000 Subject: [PATCH 70/70] Ignore --external-downloader-args if --external-downloader was rejected ... and generate warning --- youtube_dl/YoutubeDL.py | 11 ++++++++++- youtube_dl/downloader/__init__.py | 3 +++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 69736acff..019e309cb 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1906,8 +1906,17 @@ class YoutubeDL(object): if not self.params.get('skip_download', False): try: + def checked_get_suitable_downloader(info_dict, params): + ed_args = params.get('external_downloader_args') + dler = get_suitable_downloader(info_dict, params) + if ed_args and not params.get('external_downloader_args'): + # external_downloader_args was cleared because external_downloader was rejected + self.report_warning('Requested external downloader cannot be used: ' + 'ignoring --external-downloader-args.') + return dler + def dl(name, info): - fd = get_suitable_downloader(info, self.params)(self, self.params) + fd = checked_get_suitable_downloader(info, self.params)(self, self.params) for ph in self._progress_hooks: fd.add_progress_hook(ph) if self.params.get('verbose'): diff --git a/youtube_dl/downloader/__init__.py b/youtube_dl/downloader/__init__.py index d8f2fa342..d701d6292 100644 --- a/youtube_dl/downloader/__init__.py +++ b/youtube_dl/downloader/__init__.py @@ -50,6 +50,9 @@ def _get_suitable_downloader(info_dict, params={}): ed = get_external_downloader(external_downloader) if ed.can_download(info_dict): return ed + # Avoid using unwanted args since external_downloader was rejected + if params.get('external_downloader_args'): + params['external_downloader_args'] = None protocol = info_dict['protocol'] if protocol.startswith('m3u8') and info_dict.get('is_live'):