From ec75141bf03b66ffc183992ec6bb50c4ff392dd8 Mon Sep 17 00:00:00 2001 From: dirkf Date: Fri, 2 May 2025 13:17:18 +0100 Subject: [PATCH 1/6] [Cache] Add `clear` function --- test/test_cache.py | 12 ++++++++++++ youtube_dl/cache.py | 42 +++++++++++++++++++++++++++++++----------- 2 files changed, 43 insertions(+), 11 deletions(-) diff --git a/test/test_cache.py b/test/test_cache.py index 21dfb6cfc..0431f4f15 100644 --- a/test/test_cache.py +++ b/test/test_cache.py @@ -66,6 +66,18 @@ class TestCache(unittest.TestCase): new_version = '.'.join(('%0.2d' % ((v + 1) if i == 0 else v, )) for i, v in enumerate(version_tuple(__version__))) self.assertIs(c.load('test_cache', 'k.', min_ver=new_version), None) + def test_cache_clear(self): + ydl = FakeYDL({ + 'cachedir': self.test_dir, + }) + c = Cache(ydl) + c.store('test_cache', 'k.', 'kay') + c.store('test_cache', 'l.', 'ell') + self.assertEqual(c.load('test_cache', 'k.'), 'kay') + c.clear('test_cache', 'k.') + self.assertEqual(c.load('test_cache', 'k.'), None) + self.assertEqual(c.load('test_cache', 'l.'), 'ell') + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/cache.py b/youtube_dl/cache.py index ba12d0373..eb0a729c2 100644 --- a/youtube_dl/cache.py +++ b/youtube_dl/cache.py @@ -1,6 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals +import errno import json import os import re @@ -8,7 +9,6 @@ import shutil import traceback from .compat import ( - compat_contextlib_suppress, compat_getenv, compat_open as open, compat_os_makedirs, @@ -78,6 +78,22 @@ class Cache(object): tb = traceback.format_exc() self._report_warning('Writing cache to {fn!r} failed: {tb}'.format(fn=fn, tb=tb)) + def clear(self, section, key, dtype='json'): + + if not self.enabled: + return + + fn = self._get_cache_fn(section, key, dtype) + self._write_debug('Clearing {section}.{key} from cache'.format(section=section, key=key)) + try: + os.remove(fn) + except Exception as e: + if getattr(e, 'errno') == errno.ENOENT: + # file not found + return + tb = traceback.format_exc() + self._report_warning('Clearing cache from {fn!r} failed: {tb}'.format(fn=fn, tb=tb)) + def _validate(self, data, min_ver): version = traverse_obj(data, self._VERSION_KEY) if not version: # Backward compatibility @@ -94,17 +110,21 @@ class Cache(object): return default cache_fn = self._get_cache_fn(section, key, dtype) - with compat_contextlib_suppress(IOError): # If no cache available + try: + with open(cache_fn, encoding='utf-8') as cachef: + self._write_debug('Loading {section}.{key} from cache'.format(section=section, key=key), only_once=True) + return self._validate(json.load(cachef), min_ver) + except (ValueError, KeyError): try: - with open(cache_fn, encoding='utf-8') as cachef: - self._write_debug('Loading {section}.{key} from cache'.format(section=section, key=key), only_once=True) - return self._validate(json.load(cachef), min_ver) - except (ValueError, KeyError): - try: - file_size = os.path.getsize(cache_fn) - except (OSError, IOError) as oe: - file_size = error_to_compat_str(oe) - self._report_warning('Cache retrieval from %s failed (%s)' % (cache_fn, file_size)) + file_size = 'size: %d' % os.path.getsize(cache_fn) + except (OSError, IOError) as oe: + file_size = error_to_compat_str(oe) + self._report_warning('Cache retrieval from %s failed (%s)' % (cache_fn, file_size)) + except Exception as e: + if getattr(e, 'errno') == errno.ENOENT: + # no cache available + return + self._report_warning('Cache retrieval from %s failed' % (cache_fn,)) return default From 3a42f6ad372935733150159b0447e790c037c1af Mon Sep 17 00:00:00 2001 From: dirkf Date: Fri, 2 May 2025 13:31:45 +0100 Subject: [PATCH 2/6] [YouTube] Cache signature timestamp from player JS * if the YT webpage can't be loaded, getting the `sts` requires loading the player JS: this caches it * based on yt-dlp/yt-dlp#13047, thx bashonly --- youtube_dl/extractor/youtube.py | 142 +++++++++++++++++++++++--------- 1 file changed, 104 insertions(+), 38 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 54073ef86..a1c9a6eae 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -49,6 +49,7 @@ from ..utils import ( parse_duration, parse_qs, qualities, + remove_end, remove_start, smuggle_url, str_or_none, @@ -1584,6 +1585,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'}, } + _PLAYER_JS_VARIANT_MAP = ( + ('main', 'player_ias.vflset/en_US/base.js'), + ('tce', 'player_ias_tce.vflset/en_US/base.js'), + ('tv', 'tv-player-ias.vflset/tv-player-ias.js'), + ('tv_es6', 'tv-player-es6.vflset/tv-player-es6.js'), + ('phone', 'player-plasma-ias-phone-en_US.vflset/base.js'), + ('tablet', 'player-plasma-ias-tablet-en_US.vflset/base.js'), + ) + @classmethod def suitable(cls, url): if parse_qs(url).get('list', [None])[0]: @@ -1631,36 +1641,83 @@ class YoutubeIE(YoutubeBaseInfoExtractor): raise ExtractorError( 'Cannot identify player %r' % (player_url,), cause=e) - def _load_player(self, video_id, player_url, fatal=True, player_id=None): - if not player_id: + def _player_js_cache_key(self, player_url, extra_id=None, _cache={}): + if player_url not in _cache: player_id = self._extract_player_info(player_url) - if player_id not in self._code_cache: + player_path = remove_start( + compat_urllib_parse.urlparse(player_url).path, + '/s/player/{0}/'.format(player_id)) + variant = next((k for k, v in self._PLAYER_JS_VARIANT_MAP + if v == player_path), None) + if not variant: + variant = next( + (k for k, v in self._PLAYER_JS_VARIANT_MAP + if re.match(re.escape(v).replace('en_US', r'\w+') + '$', player_path)), + None) + if not variant: + self.write_debug( + 'Unable to determine player JS variant\n' + ' player = {0}'.format(player_url), only_once=True) + variant = re.sub(r'[^a-zA-Z0-9]', '_', remove_end(player_path, '.js')) + _cache[player_url] = join_nonempty(player_id, variant) + + if extra_id: + extra_id = '-'.join((_cache[player_url], extra_id)) + assert os.path.basename(extra_id) == extra_id + return extra_id + return _cache[player_url] + + def _load_player(self, video_id, player_url, fatal=True): + player_js_key = self._player_js_cache_key(player_url) + if player_js_key not in self._code_cache: code = self._download_webpage( player_url, video_id, fatal=fatal, - note='Downloading player ' + player_id, - errnote='Download of %s failed' % player_url) + note='Downloading player {0}'.format(player_js_key), + errnote='Download of {0} failed'.format(player_url)) if code: - self._code_cache[player_id] = code - return self._code_cache[player_id] if fatal else self._code_cache.get(player_id) + self._code_cache[player_js_key] = code + return self._code_cache.get(player_js_key) + + def _load_player_data_from_cache(self, name, player_url, extra_id=None): + cache_id = ('youtube-{0}'.format(name), self._player_js_cache_key(player_url, extra_id)) + data = self._player_cache.get(cache_id) + if data: + return data + + data = self.cache.load(*cache_id, min_ver='2025.04.07') + if data: + self._player_cache[cache_id] = data + return data + + def _store_player_data_to_cache(self, name, player_url, data, extra_id=None): + cache_id = ('youtube-{0}'.format(name), self._player_js_cache_key(player_url, extra_id)) + + if cache_id not in self._player_cache: + self.cache.store(cache_id[0], cache_id[1], data) + self._player_cache[cache_id] = data def _extract_signature_function(self, video_id, player_url, example_sig): - player_id = self._extract_player_info(player_url) + # player_id = self._extract_player_info(player_url) # Read from filesystem cache - func_id = 'js_{0}_{1}'.format( - player_id, self._signature_cache_id(example_sig)) - assert os.path.basename(func_id) == func_id - - self.write_debug('Extracting signature function {0}'.format(func_id)) - cache_spec, code = self.cache.load('youtube-sigfuncs', func_id, min_ver='2025.04.07'), None + extra_id = self._signature_cache_id(example_sig) + self.write_debug('Extracting signature function {0}-{1}'.format(player_url, extra_id)) + cache_spec, code = self._load_player_data_from_cache( + 'sigfuncs', player_url, extra_id=extra_id, min_ver='2025.04.07' + ), None if not cache_spec: - code = self._load_player(video_id, player_url, player_id) - if code: - res = self._parse_sig_js(code) - test_string = ''.join(map(compat_chr, range(len(example_sig)))) - cache_spec = [ord(c) for c in res(test_string)] - self.cache.store('youtube-sigfuncs', func_id, cache_spec) + code = self._load_player(video_id, player_url) + if code: + res = self._parse_sig_js(code) + test_string = ''.join(map(compat_chr, range(len(example_sig)))) + cache_spec = [ord(c) for c in res(test_string)] + self._store_player_data_to_cache( + 'sigfuncs', player_url, cache_spec, extra_id=extra_id) + else: + self.report_warning( + 'Failed to compute signature function {0}-{1}'.format( + player_url, extra_id)) return lambda s: ''.join(s[i] for i in cache_spec) @@ -1885,22 +1942,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def _extract_n_function_code(self, video_id, player_url): player_id = self._extract_player_info(player_url) - func_code = self.cache.load('youtube-nsig', player_id, min_ver='2025.04.07') + func_code = self._load_player_data_from_cache('nsig', player_url) jscode = func_code or self._load_player(video_id, player_url) jsi = JSInterpreter(jscode) if func_code: return jsi, player_id, func_code - return self._extract_n_function_code_jsi(video_id, jsi, player_id) - def _extract_n_function_code_jsi(self, video_id, jsi, player_id=None): + return self._extract_n_function_code_jsi(video_id, jsi, player_id, player_url) + def _extract_n_function_code_jsi(self, video_id, jsi, player_id=None, player_url=None): func_name = self._extract_n_function_name(jsi.code) func_code = self._extract_sig_fn(jsi, func_name) - - if player_id: - self.cache.store('youtube-nsig', player_id, func_code) + if player_url: + self._store_player_data_to_cache('nsig', player_url, func_code) return jsi, player_id, func_code def _extract_n_function_from_code(self, jsi, func_code): @@ -1944,18 +2000,28 @@ class YoutubeIE(YoutubeBaseInfoExtractor): Required to tell API what sig/player version is in use. """ sts = traverse_obj(ytcfg, 'STS', expected_type=int) - if not sts: - # Attempt to extract from player - if player_url is None: - error_msg = 'Cannot extract signature timestamp without player_url.' - if fatal: - raise ExtractorError(error_msg) - self.report_warning(error_msg) - return - code = self._load_player(video_id, player_url, fatal=fatal) - sts = int_or_none(self._search_regex( - r'(?:signatureTimestamp|sts)\s*:\s*(?P[0-9]{5})', code or '', - 'JS player signature timestamp', group='sts', fatal=fatal)) + if sts: + return sts + + if not player_url: + error_msg = 'Cannot extract signature timestamp without player url' + if fatal: + raise ExtractorError(error_msg) + self.report_warning(error_msg) + return None + + sts = self._load_player_data_from_cache('sts', player_url) + if sts: + return sts + + # Attempt to extract from player + code = self._load_player(video_id, player_url, fatal=fatal) + sts = int_or_none(self._search_regex( + r'(?:signatureTimestamp|sts)\s*:\s*(?P[0-9]{5})', code or '', + 'JS player signature timestamp', group='sts', fatal=fatal)) + if sts: + self._store_player_data_to_cache('sts', player_url, sts) + return sts def _mark_watched(self, video_id, player_response): From 4a31290ae14705100814b88f158e7876f83a5b67 Mon Sep 17 00:00:00 2001 From: dirkf Date: Fri, 2 May 2025 13:38:32 +0100 Subject: [PATCH 3/6] [YouTube] Delete cached problem nsig cache data on descrambling error * inspired by yt-dlp/yt-dlp#12750 --- youtube_dl/extractor/youtube.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index a1c9a6eae..0d6ffa3f2 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1696,6 +1696,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor): self.cache.store(cache_id[0], cache_id[1], data) self._player_cache[cache_id] = data + def _remove_player_data_from_cache(self, name, player_url, extra_id=None): + cache_id = ('youtube-{0}'.format(name), self._player_js_cache_key(player_url, extra_id)) + + if cache_id in self._player_cache: + self.cache.clear(*cache_id) + self._player_cache.pop(cache_id, None) + def _extract_signature_function(self, video_id, player_url, example_sig): # player_id = self._extract_player_info(player_url) @@ -1989,7 +1996,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): n_param = n_param[-1] n_response = decrypt_nsig(n_param)(n_param, video_id, player_url) if n_response is None: - # give up if descrambling failed + # give up and forget cached data if descrambling failed + self._remove_player_data_from_cache('nsig', player_url) break fmt['url'] = update_url_query(fmt['url'], {'n': n_response}) From 680069a14958187cf576ed91c96d2f273aea711f Mon Sep 17 00:00:00 2001 From: dirkf Date: Fri, 2 May 2025 13:49:05 +0100 Subject: [PATCH 4/6] [YouTube] Improve n-sig function extraction for player `aa3fc80b` Resolves #33123 --- test/test_youtube_signature.py | 4 ++++ youtube_dl/extractor/youtube.py | 34 +++++++++++++++++++++------------ 2 files changed, 26 insertions(+), 12 deletions(-) diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index a2d3a41ae..98221b9c2 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -346,6 +346,10 @@ _NSIG_TESTS = [ 'https://www.youtube.com/s/player/8a8ac953/tv-player-es6.vflset/tv-player-es6.js', 'MiBYeXx_vRREbiCCmh', 'RtZYMVvmkE0JE', ), + ( + 'https://www.youtube.com/s/player/aa3fc80b/player_ias.vflset/en_US/base.js', + '0qY9dal2uzOnOGwa-48hha', 'VSh1KDfQMk-eag', + ), ] diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 0d6ffa3f2..7290ae813 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1878,6 +1878,20 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def _extract_n_function_name(self, jscode): func_name, idx = None, None + + def generic_n_function_search(func_name=None): + return self._search_regex( + r'''(?xs) + (?:(?<=[^\w$])|^) # instead of \b, which ignores $ + (?P%s)\s*=\s*function\((?!\d)[a-zA-Z\d_$]+\) + \s*\{(?:(?!};).)+?(?: + ["']enhanced_except_ | + return\s*(?P"|')[a-zA-Z\d-]+_w8_(?P=q)\s*\+\s*[\w$]+ + ) + ''' % (func_name or r'(?!\d)[a-zA-Z\d_$]+',), jscode, + 'Initial JS player n function name', group='name', + default=None if func_name else NO_DEFAULT) + # these special cases are redundant and probably obsolete (2025-04): # they make the tests run ~10% faster without fallback warnings r""" @@ -1918,26 +1932,22 @@ class YoutubeIE(YoutubeBaseInfoExtractor): (?(idx)|\[\s*)(?P(?!\d)[\w$]+)(?(idx)|\s*\]) \s*?[;\n] ''', jscode): - func_name = self._search_regex( + fn = self._search_regex( r'[;,]\s*(function\s+)?({0})(?(1)|\s*=\s*function)\s*\((?!\d)[\w$]+\)\s*\{1}(?!\s*return\s)'.format( re.escape(m.group('nfunc')), '{'), jscode, 'Initial JS player n function name (2)', group=2, default=None) - if func_name: + if fn: + func_name = fn idx = m.group('idx') - break + if generic_n_function_search(func_name): + # don't look any further + break # thx bashonly: yt-dlp/yt-dlp/pull/10611 if not func_name: self.report_warning('Falling back to generic n function search', only_once=True) - return self._search_regex( - r'''(?xs) - (?:(?<=[^\w$])|^) # instead of \b, which ignores $ - (?P(?!\d)[a-zA-Z\d_$]+)\s*=\s*function\((?!\d)[a-zA-Z\d_$]+\) - \s*\{(?:(?!};).)+?(?: - ["']enhanced_except_ | - return\s*(?P"|')[a-zA-Z\d-]+_w8_(?P=q)\s*\+\s*[\w$]+ - ) - ''', jscode, 'Initial JS player n function name', group='name') + return generic_n_function_search() + if not idx: return func_name From e102b9993af6defb4e03699840b01c63e5623276 Mon Sep 17 00:00:00 2001 From: dirkf Date: Sat, 3 May 2025 18:33:39 +0100 Subject: [PATCH 5/6] [workflows/ci.yml] Move pinned Ubuntu runner images from withdrawn 20.4 to 22.04 * fix consequent missing `python-is-python2` package --- .github/workflows/ci.yml | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d3b9ae016..8234e0ccb 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -116,7 +116,7 @@ jobs: strategy: fail-fast: true matrix: - os: [ubuntu-20.04] + os: [ubuntu-22.04] python-version: ${{ fromJSON(needs.select.outputs.cpython-versions) }} python-impl: [cpython] ytdl-test-set: ${{ fromJSON(needs.select.outputs.test-set) }} @@ -133,12 +133,12 @@ jobs: ytdl-test-set: ${{ contains(needs.select.outputs.test-set, 'download') && 'download' || 'nodownload' }} run-tests-ext: bat # jython - - os: ubuntu-20.04 + - os: ubuntu-22.04 python-version: 2.7 python-impl: jython ytdl-test-set: ${{ contains(needs.select.outputs.test-set, 'core') && 'core' || 'nocore' }} run-tests-ext: sh - - os: ubuntu-20.04 + - os: ubuntu-22.04 python-version: 2.7 python-impl: jython ytdl-test-set: ${{ contains(needs.select.outputs.test-set, 'download') && 'download' || 'nodownload' }} @@ -160,7 +160,7 @@ jobs: # NB may run apt-get install in Linux uses: ytdl-org/setup-python@v1 env: - # Temporary workaround for Python 3.5 failures - May 2024 + # Temporary (?) workaround for Python 3.5 failures - May 2024 PIP_TRUSTED_HOST: "pypi.python.org pypi.org files.pythonhosted.org" with: python-version: ${{ matrix.python-version }} @@ -240,7 +240,10 @@ jobs: # install 2.7 shell: bash run: | - sudo apt-get install -y python2 python-is-python2 + # Ubuntu 22.04 no longer has python-is-python2: fetch it + curl -L "http://launchpadlibrarian.net/474693132/python-is-python2_2.7.17-4_all.deb" -o python-is-python2.deb + sudo apt-get install -y python2 + sudo dpkg --force-breaks -i python-is-python2.deb echo "PYTHONHOME=/usr" >> "$GITHUB_ENV" #-------- Python 2.6 -- - name: Set up Python 2.6 environment From a084c80f7bac9ae343075a97cc0fb2c1c96ade89 Mon Sep 17 00:00:00 2001 From: dirkf Date: Sun, 4 May 2025 12:51:54 +0100 Subject: [PATCH 6/6] [YouTube] Fix 680069a, excess `min_ver` Resolves #33125. --- youtube_dl/extractor/youtube.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 7290ae813..b31798729 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1710,8 +1710,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): extra_id = self._signature_cache_id(example_sig) self.write_debug('Extracting signature function {0}-{1}'.format(player_url, extra_id)) cache_spec, code = self._load_player_data_from_cache( - 'sigfuncs', player_url, extra_id=extra_id, min_ver='2025.04.07' - ), None + 'sigfuncs', player_url, extra_id=extra_id), None if not cache_spec: code = self._load_player(video_id, player_url)