mirror of
https://github.com/cooperhammond/irs.git
synced 2024-12-22 17:35:28 +00:00
rewrote youtube link finder
This commit is contained in:
parent
13800ad087
commit
1bbea0086b
203
r&d/youtube_search.py
Normal file
203
r&d/youtube_search.py
Normal file
|
@ -0,0 +1,203 @@
|
|||
import sys
|
||||
import re
|
||||
|
||||
if sys.version_info[0] >= 3:
|
||||
from urllib.parse import urlencode
|
||||
from urllib.request import urlopen
|
||||
elif sys.version_info[0] < 3:
|
||||
from urllib import urlencode
|
||||
from urllib import urlopen
|
||||
else:
|
||||
print("Must be using Python 2 or 3")
|
||||
sys.exit(1)
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
|
||||
def find_url(args):
    """Finds the youtube video url for the requested song. The youtube
    query is constructed like this:
        "<artist> <song> <search terms>"
    so plugging in "Bohemian Rhapsody", "Queen", and "lyrics" would end
    up with a search for "Queen Bohemian Rhapsody lyrics" on youtube

    :param args: A dictionary with the following possible arguments
        (anything that is not a dictionary is treated as an empty one)
        :param-required song: song name
        :param-required artist: artist name
        :param search_terms: any additional search terms you may want to
            add to the search query
        :param first: if truthy, just returns the first youtube search
            result without scoring the titles
        :param caught_by_google: passed through to _get_url_data
        :param download_first: accepted but currently unused
        :param total_search_tries: accepted but currently unused
    :rtype: A string of the youtube url for the song
    :raises TypeError: if "song" or "artist" is missing
    :raises Exception: if no video links were found for the query
    """
    args = args if isinstance(args, dict) else {}

    song_title = args.get("song")
    artist_name = args.get("artist")
    if song_title is None or artist_name is None:
        # Fail with a readable message instead of an opaque
        # "unsupported operand" TypeError from the concatenation below.
        raise TypeError('find_url requires both "song" and "artist"')

    search_terms = args.get("search_terms")
    first = args.get("first")
    caught_by_google = args.get("caught_by_google")

    query = artist_name + " " + song_title
    if search_terms:
        query += " " + search_terms

    encoded_query = urlencode({"search_query": query})
    url = "http://www.youtube.com/results?" + encoded_query

    soup = _get_url_data(url, caught_by_google)

    # if you want to inspect the html being requested
    # print(soup.prettify())
    # with open("index.html", "wb") as f:
    #     f.write(soup.prettify().encode('utf-8'))

    # Each of the tags in the results list have the following relevant
    # attributes:
    #   "title": the title of the youtube video
    #   "href": the youtube video code, namely the X's of
    #       https://www.youtube.com/watch?v=XXXXXXXXXXX
    #   "class": the classes of the link, used to identify the youtube title
    results = _find_links(soup)

    if not results:
        raise Exception('There were no search results for "{}"'.format(query))

    if first:
        return "https://youtube.com" + results[0]["href"]

    # Pick the highest scoring link. max() returns the first maximal item,
    # so ties are resolved in favour of youtube's own ranking.
    best_link = max(
        results,
        key=lambda link: _score_song(
            song_title, artist_name, link.get("title") or ""
        ),
    )

    return "https://youtube.com" + best_link["href"]
|
||||
|
||||
|
||||
def _score_song(song_title, artist_name, video_title):
    """Scores the likelihood of the song audio being in the video based off of
    the video title.

    The score is 3 points if the song title appears in the video title,
    plus 3 points if the artist name does, minus one point for every
    garbage phrase reported by _count_garbage_phrases.

    :param song_title: a string, the title of the song that you're looking for
    :param artist_name: a string, the name of the artist of the song
    :param video_title: a string, the title of the video you're analyzing
    :rtype: an integer, the score of the song
    """
    # Count garbage phrases on the untouched titles: once punctuation and
    # spaces are stripped, "@" can never match and short phrases start
    # matching across word boundaries ("that one" -> "thatone" has "to").
    garbage_count = _count_garbage_phrases(video_title, song_title)

    # Compare case-insensitively on the alphanumeric characters only, so
    # punctuation and capitalisation differences do not prevent a match.
    simple_song = _simplify(song_title).lower()
    simple_artist = _simplify(artist_name).lower()
    simple_video = _simplify(video_title).lower()

    points = 0

    if simple_song in simple_video:
        points += 3

    if simple_artist in simple_video:
        points += 3

    return points - garbage_count
|
||||
|
||||
|
||||
def _simplify(string):
|
||||
"""Lowercases and strips all non alphanumeric characters from the string
|
||||
:param string: a string to be modified
|
||||
:rtype: the modified string
|
||||
"""
|
||||
return re.sub(r'[^a-zA-Z0-9]+', '', string)
|
||||
|
||||
|
||||
def _count_garbage_phrases(video_title, song_title):
|
||||
"""Checks if there are any phrases in the title of the video that would
|
||||
indicate it doesn't have the audio we want
|
||||
:param string: a string, the youtube video title
|
||||
:param title: a string, the actual title of the song we're looking for
|
||||
:rtype: an integer, of the number of bad phrases in the song
|
||||
"""
|
||||
|
||||
# Garbage phrases found through experiences of downloading the wrong song
|
||||
# TODO: add this into the config so the user can mess with it if they want
|
||||
garbage_phrases = (
|
||||
"cover album live clean rare version full full album row at "
|
||||
"@ session how to npr music reimagined hr version"
|
||||
).split(" ")
|
||||
|
||||
bad_phrases = 0
|
||||
|
||||
for gphrase in garbage_phrases:
|
||||
# make sure we're not invalidating part of the title of the song
|
||||
if gphrase in song_title.lower():
|
||||
continue
|
||||
|
||||
# check if the garbage phrase is not in the video title
|
||||
if gphrase in video_title.lower():
|
||||
bad_phrases += 1
|
||||
|
||||
return bad_phrases
|
||||
|
||||
|
||||
def _find_links(soup):
    """Collects the youtube video links contained in the html soup

    Every anchor tag of the soup is run through _find_link and only the
    tags it accepts are kept, in document order.

    :param soup: a BeautifulSoup(...) element
    :rtype: returns a list of valid youtube video links
    """
    candidates = (_find_link(anchor) for anchor in soup.find_all("a"))
    return [candidate for candidate in candidates if candidate]
|
||||
|
||||
|
||||
def _find_link(link):
|
||||
"""Tests html tags to see if they contain a youtube video link.
|
||||
Should be used only with the find_links function in a map func.
|
||||
:param link: accepts an element from BeautifulSoup(...).find_all(...)
|
||||
:rtype: returns the link if it's an actual video link, otherwise, None
|
||||
"""
|
||||
try:
|
||||
class_to_check = str(" ".join(link["class"]))
|
||||
except KeyError:
|
||||
return
|
||||
|
||||
# these classes are found by inspecting the html soup of a youtube search.
|
||||
valid_classes = [
|
||||
"yt-simple-endpoint style-scope ytd-video-renderer",
|
||||
("yt-uix-tile-link yt-ui-ellipsis yt-ui-ellipsis-2 "
|
||||
"yt-uix-sessionlink spf-link ")
|
||||
]
|
||||
|
||||
try:
|
||||
# Make sure it's not a playlist
|
||||
if "&list=" in link["href"]:
|
||||
return
|
||||
|
||||
for valid_class in valid_classes:
|
||||
if valid_class in class_to_check:
|
||||
return link
|
||||
except KeyError:
|
||||
pass
|
||||
|
||||
|
||||
# TODO: build in the captcha cheater if the user is "caught" by google
|
||||
def _get_url_data(url, caught_by_google):
    """Gets parsed html from the specified url

    :param url: A string, the url to request and parse.
    :param caught_by_google: A boolean, meant to switch on the captcha
        cheat to get around google's captcha. It is currently ignored
        because the captcha handling is not implemented yet.
    :rtype: A BeautifulSoup class
    """
    response = urlopen(url)
    try:
        html_content = response.read()
    finally:
        # Always release the connection, even if reading fails. A plain
        # try/finally is used because the Python 2 urlopen result is not
        # a context manager.
        response.close()

    return BeautifulSoup(html_content, 'html.parser')
|
Loading…
Reference in a new issue