[ted] Fix extraction (closes #13535)

This commit is contained in:
Sergey M․ 2017-07-01 18:39:01 +07:00
parent 54faac2235
commit 4917478803
No known key found for this signature in database
GPG key ID: 2C393E0F18A9236D

View file

@@ -6,7 +6,10 @@ import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import compat_str from ..compat import compat_str
from ..utils import int_or_none from ..utils import (
int_or_none,
try_get,
)
class TEDIE(InfoExtractor): class TEDIE(InfoExtractor):
@@ -113,8 +116,9 @@ class TEDIE(InfoExtractor):
} }
def _extract_info(self, webpage): def _extract_info(self, webpage):
info_json = self._search_regex(r'q\("\w+.init",({.+})\)</script>', info_json = self._search_regex(
webpage, 'info json') r'(?s)q\(\s*"\w+.init"\s*,\s*({.+})\)\s*</script>',
webpage, 'info json')
return json.loads(info_json) return json.loads(info_json)
def _real_extract(self, url): def _real_extract(self, url):
@@ -136,11 +140,16 @@ class TEDIE(InfoExtractor):
webpage = self._download_webpage(url, name, webpage = self._download_webpage(url, name,
'Downloading playlist webpage') 'Downloading playlist webpage')
info = self._extract_info(webpage) info = self._extract_info(webpage)
playlist_info = info['playlist']
playlist_info = try_get(
info, lambda x: x['__INITIAL_DATA__']['playlist'],
dict) or info['playlist']
playlist_entries = [ playlist_entries = [
self.url_result('http://www.ted.com/talks/' + talk['slug'], self.ie_key()) self.url_result('http://www.ted.com/talks/' + talk['slug'], self.ie_key())
for talk in info['talks'] for talk in try_get(
info, lambda x: x['__INITIAL_DATA__']['talks'],
dict) or info['talks']
] ]
return self.playlist_result( return self.playlist_result(
playlist_entries, playlist_entries,
@@ -149,9 +158,14 @@ class TEDIE(InfoExtractor):
def _talk_info(self, url, video_name): def _talk_info(self, url, video_name):
webpage = self._download_webpage(url, video_name) webpage = self._download_webpage(url, video_name)
self.report_extraction(video_name)
talk_info = self._extract_info(webpage)['talks'][0] info = self._extract_info(webpage)
talk_info = try_get(
info, lambda x: x['__INITIAL_DATA__']['talks'][0],
dict) or info['talks'][0]
title = talk_info['title'].strip()
external = talk_info.get('external') external = talk_info.get('external')
if external: if external:
@@ -165,19 +179,27 @@ class TEDIE(InfoExtractor):
'url': ext_url or external['uri'], 'url': ext_url or external['uri'],
} }
native_downloads = try_get(
talk_info, lambda x: x['downloads']['nativeDownloads'],
dict) or talk_info['nativeDownloads']
formats = [{ formats = [{
'url': format_url, 'url': format_url,
'format_id': format_id, 'format_id': format_id,
'format': format_id, 'format': format_id,
} for (format_id, format_url) in talk_info['nativeDownloads'].items() if format_url is not None] } for (format_id, format_url) in native_downloads.items() if format_url is not None]
if formats: if formats:
for f in formats: for f in formats:
finfo = self._NATIVE_FORMATS.get(f['format_id']) finfo = self._NATIVE_FORMATS.get(f['format_id'])
if finfo: if finfo:
f.update(finfo) f.update(finfo)
player_talk = talk_info['player_talks'][0]
resources_ = player_talk.get('resources') or talk_info.get('resources')
http_url = None http_url = None
for format_id, resources in talk_info['resources'].items(): for format_id, resources in resources_.items():
if format_id == 'h264': if format_id == 'h264':
for resource in resources: for resource in resources:
h264_url = resource.get('file') h264_url = resource.get('file')
@@ -237,14 +259,11 @@ class TEDIE(InfoExtractor):
video_id = compat_str(talk_info['id']) video_id = compat_str(talk_info['id'])
thumbnail = talk_info['thumb']
if not thumbnail.startswith('http'):
thumbnail = 'http://' + thumbnail
return { return {
'id': video_id, 'id': video_id,
'title': talk_info['title'].strip(), 'title': title,
'uploader': talk_info['speaker'], 'uploader': player_talk.get('speaker') or talk_info.get('speaker'),
'thumbnail': thumbnail, 'thumbnail': player_talk.get('thumb') or talk_info.get('thumb'),
'description': self._og_search_description(webpage), 'description': self._og_search_description(webpage),
'subtitles': self._get_subtitles(video_id, talk_info), 'subtitles': self._get_subtitles(video_id, talk_info),
'formats': formats, 'formats': formats,