[tagesschau] Fix audio support

This commit is contained in:
Sergey M․ 2016-05-01 04:38:46 +06:00
parent 4c1b2e5c0e
commit 1a2b377cc2
No known key found for this signature in database
GPG key ID: 2C393E0F18A9236D

View file

@ -4,7 +4,10 @@ from __future__ import unicode_literals
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import parse_filesize from ..utils import (
determine_ext,
parse_filesize,
)
class TagesschauIE(InfoExtractor): class TagesschauIE(InfoExtractor):
@ -82,37 +85,54 @@ class TagesschauIE(InfoExtractor):
'xxl': {'quality': 5}, 'xxl': {'quality': 5},
} }
def _extract_formats(self, download_text, media_kind='Video'):
    """Build the format list from a download-links HTML fragment.

    download_text is the HTML that holds one ``<div class="button">``
    element per downloadable file.  media_kind is the word captured
    from the page ("Video" or "Audio", compared case-insensitively); it
    selects which layout of the button's ``title`` attribute is parsed.
    It defaults to 'Video' so that callers written before audio support
    was added keep working unchanged.

    Returns the list of format dicts after it has been passed through
    self._sort_formats().
    """
    # Loop-invariant: decide once which title layout to parse.
    is_video = media_kind.lower() == 'video'

    formats = []
    for link in re.finditer(
            r'<div class="button" title="(?P<title>[^"]*)"><a href="(?P<url>[^"]+)">(?P<name>.+?)</a></div>',
            download_text):
        link_url = link.group('url')
        if not link_url:
            continue

        # The format ID is the dotted component just before the file
        # extension; when the URL has no such component fall back to
        # the extension itself instead of failing.
        format_id = self._search_regex(
            r'.*/[^/.]+\.([^/]+)\.[^/.]+$', link_url, 'format ID',
            default=determine_ext(link_url))

        fmt = {
            'format_id': format_id,
            'url': link_url,
            'format_name': link.group('name'),
        }

        title = link.group('title')
        if title:
            if is_video:
                # Video titles carry codec, dimensions, bitrates and
                # approximate size, separated by the "&#10;" entity.
                m = re.match(
                    r'''(?x)
                        Video:\s*(?P<vcodec>[a-zA-Z0-9/._-]+)\s*&\#10;
                        (?P<width>[0-9]+)x(?P<height>[0-9]+)px&\#10;
                        (?P<vbr>[0-9]+)kbps&\#10;
                        Audio:\s*(?P<abr>[0-9]+)kbps,\s*(?P<audio_desc>[A-Za-z\.0-9]+)&\#10;
                        Gr&ouml;&szlig;e:\s*(?P<filesize_approx>[0-9.,]+\s+[a-zA-Z]*B)''',
                    title)
                if m:
                    fmt.update({
                        'format_note': m.group('audio_desc'),
                        'vcodec': m.group('vcodec'),
                        'width': int(m.group('width')),
                        'height': int(m.group('height')),
                        'abr': int(m.group('abr')),
                        'vbr': int(m.group('vbr')),
                        'filesize_approx': parse_filesize(m.group('filesize_approx')),
                    })
            else:
                # Audio titles look like "<name>-Format: <n>kbps, <note>".
                m = re.match(
                    r'(?P<format>.+?)-Format\s*:\s*(?P<abr>\d+)kbps\s*,\s*(?P<note>.+)',
                    title)
                if m:
                    fmt.update({
                        'format_note': '%s, %s' % (m.group('format'), m.group('note')),
                        # Explicitly mark the format as audio-only.
                        'vcodec': 'none',
                        'abr': int(m.group('abr')),
                    })

        formats.append(fmt)

    self._sort_formats(formats)
    return formats
@ -154,23 +174,26 @@ class TagesschauIE(InfoExtractor):
title = self._html_search_regex( title = self._html_search_regex(
r'<span class="headline".*?>(.*?)</span>', webpage, 'title') r'<span class="headline".*?>(.*?)</span>', webpage, 'title')
DOWNLOAD_REGEX = r'(?s)<p>Wir bieten dieses Video in folgenden Formaten zum Download an:</p>\s*<div class="controls">(.*?)</div>\s*<p>' DOWNLOAD_REGEX = r'(?s)<p>Wir bieten dieses (?P<kind>Video|Audio) in folgenden Formaten zum Download an:</p>\s*<div class="controls">(?P<links>.*?)</div>\s*<p>'
webpage_type = self._og_search_property('type', webpage, default=None) webpage_type = self._og_search_property('type', webpage, default=None)
if webpage_type == 'website': # Article if webpage_type == 'website': # Article
entries = [] entries = []
for num, (entry_title, download_text) in enumerate(re.findall( for num, (entry_title, media_kind, download_text) in enumerate(re.findall(
r'(?s)<p[^>]+class="infotext"[^>]*>.*?<strong>(.+?)</strong>.*?</p>.*?%s' % DOWNLOAD_REGEX, r'(?s)<p[^>]+class="infotext"[^>]*>.*?<strong>(.+?)</strong>.*?</p>.*?%s' % DOWNLOAD_REGEX,
webpage)): webpage)):
entries.append({ entries.append({
'id': display_id, 'id': display_id,
'title': '%s-%d' % (entry_title, num), 'title': '%s-%d' % (entry_title, num),
'formats': self._extract_formats(download_text), 'formats': self._extract_formats(download_text, media_kind),
}) })
return self.playlist_result(entries, display_id, title) return self.playlist_result(entries, display_id, title)
else: # Assume single video else: # Assume single video
download_text = self._search_regex(DOWNLOAD_REGEX, webpage, 'download links') download_text = self._search_regex(
formats = self._extract_formats(download_text) DOWNLOAD_REGEX, webpage, 'download links', group='links')
media_kind = self._search_regex(
DOWNLOAD_REGEX, webpage, 'media kind', default='Video', group='kind')
formats = self._extract_formats(download_text, media_kind)
thumbnail = self._og_search_thumbnail(webpage) thumbnail = self._og_search_thumbnail(webpage)
description = self._html_search_regex( description = self._html_search_regex(
r'(?s)<p class="teasertext">(.*?)</p>', r'(?s)<p class="teasertext">(.*?)</p>',