From f8f3b6f2bf5673d96c5743de008bd9f9214156fe Mon Sep 17 00:00:00 2001
From: Grabien
Date: Mon, 27 Sep 2021 22:22:39 +0300
Subject: [PATCH] C-Span links support improvements

---
Review notes (this section is ignored when the patch is applied):
 * Line breaks and indentation were reconstructed from a whitespace-collapsed
   copy; context-line indentation should be confirmed against the target tree.
 * _get_id() referenced an undefined _ID_URL; it now falls back to _VALID_URL.
 * GenericIE no longer indexes entry_info_dict['formats'] when it is missing.
 * The post-image blob ids on the "index" lines predate these review changes.

 youtube_dl/extractor/cspan.py   | 36 ++++++++++++++++++++++++++++++++----
 youtube_dl/extractor/generic.py | 15 +++++++++++++++
 2 files changed, 47 insertions(+), 4 deletions(-)

diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py
index 2e01aff48..fbd0e7237 100644
--- a/youtube_dl/extractor/cspan.py
+++ b/youtube_dl/extractor/cspan.py
@@ -1,11 +1,14 @@
 from __future__ import unicode_literals
 
+import os
 import re
 
-from .common import InfoExtractor
+from .common import InfoExtractor, compat_str, compat_urllib_parse_unquote
+from .senateisvp import SenateISVPIE
+from .ustream import UstreamIE
 from ..utils import (
-    determine_ext,
     ExtractorError,
+    determine_ext,
     extract_attributes,
     find_xpath_attr,
     get_element_by_attribute,
@@ -18,11 +21,10 @@ from ..utils import (
     str_to_int,
     unescapeHTML,
 )
-from .senateisvp import SenateISVPIE
-from .ustream import UstreamIE
 
 
 class CSpanIE(InfoExtractor):
+    _BASIC_URL = r'https?://(?:www\.)?c-span\.org/'
     _VALID_URL = r'https?://(?:www\.)?c-span\.org/video/\?(?P<id>[0-9a-f]+)'
     IE_DESC = 'C-SPAN'
     _TESTS = [{
@@ -81,6 +83,32 @@ class CSpanIE(InfoExtractor):
     }]
     BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s'
 
+    @classmethod
+    def is_basic_url(cls, url):
+        """Return True if url starts with the C-SPAN site prefix (_BASIC_URL)."""
+        # Compile lazily and cache on this exact class (cls.__dict__, not inherited)
+        if '_BASIC_URL_RE' not in cls.__dict__:
+            cls._BASIC_URL_RE = re.compile(cls._BASIC_URL)
+
+        return cls._BASIC_URL_RE.match(url) is not None
+
+    @classmethod
+    def get_basic_url(cls, url):
+        """Return the first three '/'-separated parts of url (scheme://host)."""
+        return "/".join(url.rstrip('/').split('/')[:3])
+
+    @classmethod
+    def _get_id(cls, url):
+        """Return the 'id' group of url, else its unquoted last path part without extension."""
+        if '_ID_URL_RE' not in cls.__dict__:
+            # This class defines no _ID_URL; fall back to _VALID_URL, which has the 'id' group
+            cls._ID_URL_RE = re.compile(getattr(cls, '_ID_URL', cls._VALID_URL))
+
+        m = cls._ID_URL_RE.match(url)
+        if m is not None:
+            return compat_str(m.group('id'))
+        return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
+
     def _real_extract(self, url):
         video_id = self._match_id(url)
         video_type = None
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index a9c064105..ef4e0b96e 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -48,6 +48,7 @@ from .nexx import (
     NexxIE,
     NexxEmbedIE,
 )
+from .cspan import CSpanIE
 from .nbc import NBCSportsVPlayerIE
 from .ooyala import OoyalaIE
 from .rutv import RUTVIE
@@ -3551,6 +3552,20 @@ class GenericIE(InfoExtractor):
             if entry_info_dict.get('formats'):
                 self._sort_formats(entry_info_dict['formats'])
 
+            if CSpanIE.is_basic_url(url):
+                basic_url = CSpanIE.get_basic_url(url)
+                cspan_headers = {
+                    'referer': basic_url + "/",
+                    'origin': basic_url,
+                    'accept': "*/*",
+                    'sec-fetch-dest': "empty",
+                    'sec-fetch-mode': "cors",
+                    'sec-fetch-site': "cross-site",
+                }
+                # 'formats' may be missing (note the .get() guard above), so never index it
+                for f in entry_info_dict.get('formats') or []:
+                    f.setdefault('http_headers', {}).update(cspan_headers)
+
             entries.append(entry_info_dict)
 
         if len(entries) == 1: