[arte] Rework extractors

* Reimplement embed and playlist extractors to delegate to the single entrypoint artetv extractor
  Beware reluctant download archive extractor keys breakage.
* Improve embeds detection (closes #27057)
- Remove obsolete code
This commit is contained in:
Sergey M․ 2020-11-19 05:02:04 +07:00
parent 9b505185da
commit 91e954587f
No known key found for this signature in database
GPG key ID: 2C393E0F18A9236D
3 changed files with 100 additions and 65 deletions

View file

@ -4,7 +4,10 @@ from __future__ import unicode_literals
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import compat_str from ..compat import (
compat_str,
compat_urlparse,
)
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
int_or_none, int_or_none,
@ -14,14 +17,44 @@ from ..utils import (
url_or_none, url_or_none,
) )
# There are different sources of video in arte.tv, the extraction process
# is different for each one. The videos usually expire in 7 days, so we can't
# add tests.
class ArteTVBaseIE(InfoExtractor): class ArteTVBaseIE(InfoExtractor):
def _extract_from_json_url(self, json_url, video_id, lang, title=None): _ARTE_LANGUAGES = 'fr|de|en|es|it|pl'
info = self._download_json(json_url, video_id) _API_BASE = 'https://api.arte.tv/api/player/v1'
class ArteTVIE(ArteTVBaseIE):
_VALID_URL = r'''(?x)
https?://
(?:
(?:www\.)?arte\.tv/(?P<lang>%(langs)s)/videos|
api\.arte\.tv/api/player/v\d+/config/(?P<lang_2>%(langs)s)
)
/(?P<id>\d{6}-\d{3}-[AF])
''' % {'langs': ArteTVBaseIE._ARTE_LANGUAGES}
_TESTS = [{
'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/',
'info_dict': {
'id': '088501-000-A',
'ext': 'mp4',
'title': 'Mexico: Stealing Petrol to Survive',
'upload_date': '20190628',
},
}, {
'url': 'https://www.arte.tv/pl/videos/100103-000-A/usa-dyskryminacja-na-porodowce/',
'only_matching': True,
}, {
'url': 'https://api.arte.tv/api/player/v2/config/de/100605-013-A',
'only_matching': True,
}]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
lang = mobj.group('lang') or mobj.group('lang_2')
info = self._download_json(
'%s/config/%s/%s' % (self._API_BASE, lang, video_id), video_id)
player_info = info['videoJsonPlayer'] player_info = info['videoJsonPlayer']
vsr = try_get(player_info, lambda x: x['VSR'], dict) vsr = try_get(player_info, lambda x: x['VSR'], dict)
@ -38,18 +71,11 @@ class ArteTVBaseIE(InfoExtractor):
if not upload_date_str: if not upload_date_str:
upload_date_str = (player_info.get('VRA') or player_info.get('VDA') or '').split(' ')[0] upload_date_str = (player_info.get('VRA') or player_info.get('VDA') or '').split(' ')[0]
title = (player_info.get('VTI') or title or player_info['VID']).strip() title = (player_info.get('VTI') or player_info['VID']).strip()
subtitle = player_info.get('VSU', '').strip() subtitle = player_info.get('VSU', '').strip()
if subtitle: if subtitle:
title += ' - %s' % subtitle title += ' - %s' % subtitle
info_dict = {
'id': player_info['VID'],
'title': title,
'description': player_info.get('VDE'),
'upload_date': unified_strdate(upload_date_str),
'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'),
}
qfunc = qualities(['MQ', 'HQ', 'EQ', 'SQ']) qfunc = qualities(['MQ', 'HQ', 'EQ', 'SQ'])
LANGS = { LANGS = {
@ -64,7 +90,6 @@ class ArteTVBaseIE(InfoExtractor):
langcode = LANGS.get(lang, lang) langcode = LANGS.get(lang, lang)
formats = [] formats = []
m3u8_formats = []
for format_id, format_dict in vsr.items(): for format_id, format_dict in vsr.items():
f = dict(format_dict) f = dict(format_dict)
format_url = url_or_none(f.get('url')) format_url = url_or_none(f.get('url'))
@ -120,6 +145,7 @@ class ArteTVBaseIE(InfoExtractor):
m3u8_id=format_id, fatal=False) m3u8_id=format_id, fatal=False)
for m3u8_format in m3u8_formats: for m3u8_format in m3u8_formats:
m3u8_format['language_preference'] = lang_pref m3u8_format['language_preference'] = lang_pref
formats.extend(m3u8_formats)
continue continue
format = { format = {
@ -142,58 +168,50 @@ class ArteTVBaseIE(InfoExtractor):
formats.append(format) formats.append(format)
self._check_formats(formats, video_id)
formats.extend(m3u8_formats)
self._sort_formats(formats) self._sort_formats(formats)
info_dict['formats'] = formats return {
return info_dict 'id': player_info.get('VID') or video_id,
'title': title,
'description': player_info.get('VDE'),
'upload_date': unified_strdate(upload_date_str),
'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'),
'formats': formats,
}
class ArteTVPlus7IE(ArteTVBaseIE): class ArteTVEmbedIE(InfoExtractor):
IE_NAME = 'arte.tv:+7' _VALID_URL = r'https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+'
_VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>fr|de|en|es|it|pl)/videos/(?P<id>\d{6}-\d{3}-[AF])'
_TESTS = [{ _TESTS = [{
'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/', 'url': 'https://www.arte.tv/player/v5/index.php?json_url=https%3A%2F%2Fapi.arte.tv%2Fapi%2Fplayer%2Fv2%2Fconfig%2Fde%2F100605-013-A&lang=de&autoplay=true&mute=0100605-013-A',
'info_dict': { 'info_dict': {
'id': '088501-000-A', 'id': '100605-013-A',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Mexico: Stealing Petrol to Survive', 'title': 'United we Stream November Lockdown Edition #13',
'upload_date': '20190628', 'description': 'md5:be40b667f45189632b78c1425c7c2ce1',
'upload_date': '20201116',
}, },
}, {
'url': 'https://www.arte.tv/player/v3/index.php?json_url=https://api.arte.tv/api/player/v2/config/de/100605-013-A',
'only_matching': True,
}] }]
def _real_extract(self, url): @staticmethod
lang, video_id = re.match(self._VALID_URL, url).groups() def _extract_urls(webpage):
return self._extract_from_json_url( return [url for _, url in re.findall(
'https://api.arte.tv/api/player/v1/config/%s/%s' % (lang, video_id), r'<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1',
video_id, lang) webpage)]
class ArteTVEmbedIE(ArteTVPlus7IE):
IE_NAME = 'arte.tv:embed'
_VALID_URL = r'''(?x)
https://www\.arte\.tv
/player/v3/index\.php\?json_url=
(?P<json_url>
https?://api\.arte\.tv/api/player/v1/config/
(?P<lang>[^/]+)/(?P<id>\d{6}-\d{3}-[AF])
)
'''
_TESTS = []
def _real_extract(self, url): def _real_extract(self, url):
json_url, lang, video_id = re.match(self._VALID_URL, url).groups() qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
return self._extract_from_json_url(json_url, video_id, lang) json_url = qs['json_url'][0]
video_id = ArteTVIE._match_id(json_url)
return self.url_result(
json_url, ie=ArteTVIE.ie_key(), video_id=video_id)
class ArteTVPlaylistIE(ArteTVBaseIE): class ArteTVPlaylistIE(ArteTVBaseIE):
IE_NAME = 'arte.tv:playlist' _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>RC-\d{6})' % ArteTVBaseIE._ARTE_LANGUAGES
_VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>fr|de|en|es|it|pl)/videos/(?P<id>RC-\d{6})'
_TESTS = [{ _TESTS = [{
'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/', 'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/',
'info_dict': { 'info_dict': {
@ -202,17 +220,35 @@ class ArteTVPlaylistIE(ArteTVBaseIE):
'description': 'md5:d322c55011514b3a7241f7fb80d494c2', 'description': 'md5:d322c55011514b3a7241f7fb80d494c2',
}, },
'playlist_mincount': 6, 'playlist_mincount': 6,
}, {
'url': 'https://www.arte.tv/pl/videos/RC-014123/arte-reportage/',
'only_matching': True,
}] }]
def _real_extract(self, url): def _real_extract(self, url):
lang, playlist_id = re.match(self._VALID_URL, url).groups() lang, playlist_id = re.match(self._VALID_URL, url).groups()
collection = self._download_json( collection = self._download_json(
'https://api.arte.tv/api/player/v1/collectionData/%s/%s?source=videos' '%s/collectionData/%s/%s?source=videos'
% (lang, playlist_id), playlist_id) % (self._API_BASE, lang, playlist_id), playlist_id)
entries = []
for video in collection['videos']:
if not isinstance(video, dict):
continue
video_url = url_or_none(video.get('url')) or url_or_none(video.get('jsonUrl'))
if not video_url:
continue
video_id = video.get('programId')
entries.append({
'_type': 'url_transparent',
'url': video_url,
'id': video_id,
'title': video.get('title'),
'alt_title': video.get('subtitle'),
'thumbnail': url_or_none(try_get(video, lambda x: x['mainImage']['url'], compat_str)),
'duration': int_or_none(video.get('durationSeconds')),
'view_count': int_or_none(video.get('views')),
'ie_key': ArteTVIE.ie_key(),
})
title = collection.get('title') title = collection.get('title')
description = collection.get('shortDescription') or collection.get('teaserText') description = collection.get('shortDescription') or collection.get('teaserText')
entries = [
self._extract_from_json_url(
video['jsonUrl'], video.get('programId') or playlist_id, lang)
for video in collection['videos'] if video.get('jsonUrl')]
return self.playlist_result(entries, playlist_id, title, description) return self.playlist_result(entries, playlist_id, title, description)

View file

@ -58,7 +58,7 @@ from .ard import (
ARDMediathekIE, ARDMediathekIE,
) )
from .arte import ( from .arte import (
ArteTVPlus7IE, ArteTVIE,
ArteTVEmbedIE, ArteTVEmbedIE,
ArteTVPlaylistIE, ArteTVPlaylistIE,
) )

View file

@ -91,6 +91,7 @@ from .piksel import PikselIE
from .videa import VideaIE from .videa import VideaIE
from .twentymin import TwentyMinutenIE from .twentymin import TwentyMinutenIE
from .ustream import UstreamIE from .ustream import UstreamIE
from .arte import ArteTVEmbedIE
from .videopress import VideoPressIE from .videopress import VideoPressIE
from .rutube import RutubeIE from .rutube import RutubeIE
from .limelight import LimelightBaseIE from .limelight import LimelightBaseIE
@ -2760,11 +2761,9 @@ class GenericIE(InfoExtractor):
return self.url_result(ustream_url, UstreamIE.ie_key()) return self.url_result(ustream_url, UstreamIE.ie_key())
# Look for embedded arte.tv player # Look for embedded arte.tv player
mobj = re.search( arte_urls = ArteTVEmbedIE._extract_urls(webpage)
r'<(?:script|iframe) [^>]*?src="(?P<url>http://www\.arte\.tv/(?:playerv2/embed|arte_vp/index)[^"]+)"', if arte_urls:
webpage) return self.playlist_from_matches(arte_urls, video_id, video_title)
if mobj is not None:
return self.url_result(mobj.group('url'), 'ArteTVEmbed')
# Look for embedded francetv player # Look for embedded francetv player
mobj = re.search( mobj = re.search(