[wdr] Extend subtitles extraction and improve overall extraction (closes #22672, closes #22723)

This commit is contained in:
Sergey M․ 2020-12-12 04:48:56 +07:00
parent 59fea63dc2
commit 3ae02d4a62
No known key found for this signature in database
GPG key ID: 2C393E0F18A9236D

View file

@ -17,6 +17,7 @@ from ..utils import (
unified_strdate, unified_strdate,
update_url_query, update_url_query,
urlhandle_detect_ext, urlhandle_detect_ext,
url_or_none,
) )
@ -42,16 +43,20 @@ class WDRIE(InfoExtractor):
is_live = metadata.get('mediaType') == 'live' is_live = metadata.get('mediaType') == 'live'
tracker_data = metadata['trackerData'] tracker_data = metadata['trackerData']
title = tracker_data['trackerClipTitle']
media_resource = metadata['mediaResource'] media_resource = metadata['mediaResource']
formats = [] formats = []
# check if the metadata contains a direct URL to a file # check if the metadata contains a direct URL to a file
for kind, media_resource in media_resource.items(): for kind, media in media_resource.items():
if not isinstance(media, dict):
continue
if kind not in ('dflt', 'alt'): if kind not in ('dflt', 'alt'):
continue continue
for tag_name, medium_url in media_resource.items(): for tag_name, medium_url in media.items():
if tag_name not in ('videoURL', 'audioURL'): if tag_name not in ('videoURL', 'audioURL'):
continue continue
@ -88,8 +93,16 @@ class WDRIE(InfoExtractor):
'url': caption_url, 'url': caption_url,
'ext': 'ttml', 'ext': 'ttml',
}] }]
captions_hash = media_resource.get('captionsHash')
title = tracker_data['trackerClipTitle'] if isinstance(captions_hash, dict):
for ext, format_url in captions_hash.items():
format_url = url_or_none(format_url)
if not format_url:
continue
subtitles.setdefault('de', []).append({
'url': format_url,
'ext': determine_ext(format_url, None) or ext,
})
return { return {
'id': tracker_data.get('trackerClipId', video_id), 'id': tracker_data.get('trackerClipId', video_id),