[extractor/common] Add support for AMP tags in _parse_html5_media_entries

This commit is contained in:
Sergey M․ 2017-07-09 16:29:52 +07:00
parent 250b042c7e
commit 4328ddf82b
No known key found for this signature in database
GPG key ID: 2C393E0F18A9236D
2 changed files with 15 additions and 2 deletions

View file

@ -2132,15 +2132,18 @@ class InfoExtractor(object):
return is_plain_url, formats return is_plain_url, formats
entries = [] entries = []
# amp-video and amp-audio are very similar to their HTML5 counterparts
# so we wll include them right here (see
# https://www.ampproject.org/docs/reference/components/amp-video)
media_tags = [(media_tag, media_type, '') media_tags = [(media_tag, media_type, '')
for media_tag, media_type for media_tag, media_type
in re.findall(r'(?s)(<(video|audio)[^>]*/>)', webpage)] in re.findall(r'(?s)(<(?:amp-)?(video|audio)[^>]*/>)', webpage)]
media_tags.extend(re.findall( media_tags.extend(re.findall(
# We only allow video|audio followed by a whitespace or '>'. # We only allow video|audio followed by a whitespace or '>'.
# Allowing more characters may end up in significant slow down (see # Allowing more characters may end up in significant slow down (see
# https://github.com/rg3/youtube-dl/issues/11979, example URL: # https://github.com/rg3/youtube-dl/issues/11979, example URL:
# http://www.porntrex.com/maps/videositemap.xml). # http://www.porntrex.com/maps/videositemap.xml).
r'(?s)(<(?P<tag>video|audio)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage)) r'(?s)(<(?P<tag>(?:amp-)?(?:video|audio))(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
for media_tag, media_type, media_content in media_tags: for media_tag, media_type, media_content in media_tags:
media_info = { media_info = {
'formats': [], 'formats': [],

View file

@ -1770,6 +1770,16 @@ class GenericIE(InfoExtractor):
}, },
'add_ie': [MediasetIE.ie_key()], 'add_ie': [MediasetIE.ie_key()],
}, },
{
# AMP embed (see https://www.ampproject.org/docs/reference/components/amp-video)
'url': 'https://tvrain.ru/amp/418921/',
'md5': 'cc00413936695987e8de148b67d14f1d',
'info_dict': {
'id': '418921',
'ext': 'mp4',
'title': 'Стас Намин: «Мы нарушили девственность Кремля»',
},
},
# { # {
# # TODO: find another test # # TODO: find another test
# # http://schema.org/VideoObject # # http://schema.org/VideoObject