Correct XML ampersand fixup

This commit is contained in:
Philipp Hagemeister 2014-01-20 22:11:34 +01:00
parent b853d2e155
commit 5aafe895fc
5 changed files with 25 additions and 10 deletions

View file

@ -16,6 +16,7 @@ from youtube_dl.utils import (
DateRange, DateRange,
encodeFilename, encodeFilename,
find_xpath_attr, find_xpath_attr,
fix_xml_ampersands,
get_meta_content, get_meta_content,
orderedSet, orderedSet,
parse_duration, parse_duration,
@ -200,5 +201,18 @@ class TestUtil(unittest.TestCase):
self.assertEqual(parse_duration('9:12:43'), 33163) self.assertEqual(parse_duration('9:12:43'), 33163)
self.assertEqual(parse_duration('x:y'), None) self.assertEqual(parse_duration('x:y'), None)
def test_fix_xml_ampersands(self):
    """Check that fix_xml_ampersands escapes only bare ampersands."""
    # Bare ampersands (e.g. in an unescaped query string) are escaped.
    self.assertEqual(
        fix_xml_ampersands('"&x=y&z=a'), '"&amp;x=y&amp;z=a')
    # An existing '&amp;' is kept; '&wrong;' is not a predefined XML
    # entity, so its ampersand is escaped.
    self.assertEqual(
        fix_xml_ampersands('"&amp;x=y&wrong;&z=a'),
        '"&amp;x=y&amp;wrong;&amp;z=a')
    # The five predefined XML entities pass through unchanged.
    self.assertEqual(
        fix_xml_ampersands('&amp;&apos;&gt;&lt;&quot;'),
        '&amp;&apos;&gt;&lt;&quot;')
    # Decimal and hexadecimal character references pass through unchanged.
    self.assertEqual(
        fix_xml_ampersands('&#1234;&#x1abC;'), '&#1234;&#x1abC;')
    # '&#' with no digits and no ';' is not a reference and is escaped.
    self.assertEqual(fix_xml_ampersands('&#&#'), '&amp;#&amp;#')
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()

View file

@ -3,7 +3,7 @@ import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
find_xpath_attr, find_xpath_attr,
fix_xml_all_ampersand, fix_xml_ampersands
) )
@ -33,7 +33,7 @@ class ClipsyndicateIE(InfoExtractor):
pdoc = self._download_xml( pdoc = self._download_xml(
'http://eplayer.clipsyndicate.com/osmf/playlist?%s' % flvars, 'http://eplayer.clipsyndicate.com/osmf/playlist?%s' % flvars,
video_id, u'Downloading video info', video_id, u'Downloading video info',
transform_source=fix_xml_all_ampersand) transform_source=fix_xml_ampersands)
track_doc = pdoc.find('trackList/track') track_doc = pdoc.find('trackList/track')
def find_param(name): def find_param(name):

View file

@ -4,7 +4,7 @@ import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
fix_xml_all_ampersand, fix_xml_ampersands,
) )
@ -27,7 +27,7 @@ class MetacriticIE(InfoExtractor):
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
# The xml is not well formatted, there are raw '&' # The xml is not well formatted, there are raw '&'
info = self._download_xml('http://www.metacritic.com/video_data?video=' + video_id, info = self._download_xml('http://www.metacritic.com/video_data?video=' + video_id,
video_id, 'Downloading info xml', transform_source=fix_xml_all_ampersand) video_id, 'Downloading info xml', transform_source=fix_xml_ampersands)
clip = next(c for c in info.findall('playList/clip') if c.find('id').text == video_id) clip = next(c for c in info.findall('playList/clip') if c.find('id').text == video_id)
formats = [] formats = []

View file

@ -5,6 +5,7 @@ from .common import InfoExtractor
from ..utils import ( from ..utils import (
compat_urllib_parse, compat_urllib_parse,
ExtractorError, ExtractorError,
fix_xml_ampersands,
) )
def _media_xml_tag(tag): def _media_xml_tag(tag):
@ -83,12 +84,9 @@ class MTVServicesInfoExtractor(InfoExtractor):
video_id = self._id_from_uri(uri) video_id = self._id_from_uri(uri)
data = compat_urllib_parse.urlencode({'uri': uri}) data = compat_urllib_parse.urlencode({'uri': uri})
def fix_ampersand(s):
""" Fix unencoded ampersand in XML """
return s.replace(u'& ', '& ')
idoc = self._download_xml( idoc = self._download_xml(
self._FEED_URL + '?' + data, video_id, self._FEED_URL + '?' + data, video_id,
u'Downloading info', transform_source=fix_ampersand) u'Downloading info', transform_source=fix_xml_ampersands)
return [self._get_video_info(item) for item in idoc.findall('.//item')] return [self._get_video_info(item) for item in idoc.findall('.//item')]

View file

@ -1092,9 +1092,12 @@ def month_by_name(name):
return None return None
def fix_xml_ampersands(xml_str):
    """Escape every bare '&' in xml_str as '&amp;'.

    An ampersand is left untouched when it already starts one of the five
    predefined XML entities (&amp; &lt; &gt; &apos; &quot;) or a numeric
    character reference (decimal '&#NNN;' or hexadecimal '&#xHHH;').
    Any other '&' is replaced by '&amp;'.

    Returns the fixed string; xml_str itself is not modified.
    """
    return re.sub(
        # Negative lookahead: match '&' only when it does NOT begin a valid
        # reference. Numeric references need at least one digit ('+'), so
        # malformed '&#;' / '&#x;' get escaped, and there is no upper digit
        # limit, so references above four digits (e.g. '&#x1F600;') are
        # still recognised and preserved.
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]+;|#[0-9]+;)',
        u'&amp;',
        xml_str)
def setproctitle(title): def setproctitle(title):