[extractor/common] Fix inline HTML5 media tags processing and add test (closes #27345)

This commit is contained in:
Sergey M․ 2020-12-09 00:05:21 +07:00
parent e2bdf8bf4f
commit 5a1fbbf8b7
No known key found for this signature in database
GPG key ID: 2C393E0F18A9236D
2 changed files with 15 additions and 3 deletions

View file

@ -108,6 +108,18 @@ class TestInfoExtractor(unittest.TestCase):
self.assertEqual(self.ie._download_json(uri, None, fatal=False), None)
def test_parse_html5_media_entries(self):
# inline video tag
expect_dict(
self,
self.ie._parse_html5_media_entries(
'https://127.0.0.1/video.html',
r'<html><video src="/vid.mp4" /></html>', None)[0],
{
'formats': [{
'url': 'https://127.0.0.1/vid.mp4',
}],
})
# from https://www.r18.com/
# with kpbs in label
expect_dict(

View file

@ -2515,9 +2515,9 @@ class InfoExtractor(object):
# https://www.ampproject.org/docs/reference/components/amp-video)
# For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
_MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
media_tags = [(media_tag, media_type, '')
for media_tag, media_type
in re.findall(r'(?s)(<%s[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
media_tags = [(media_tag, media_tag_name, media_type, '')
for media_tag, media_tag_name, media_type
in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
media_tags.extend(re.findall(
# We only allow video|audio followed by a whitespace or '>'.
# Allowing more characters may end up in significant slow down (see