rewrote youtube link finder

This commit is contained in:
Cooper Hammond 2019-05-04 19:03:19 -07:00
parent 13800ad087
commit 1bbea0086b

r&d/youtube_search.py (new file, 203 additions)

@@ -0,0 +1,203 @@
import sys
import re

if sys.version_info[0] >= 3:
    from urllib.parse import urlencode
    from urllib.request import urlopen
elif sys.version_info[0] < 3:
    from urllib import urlencode
    from urllib import urlopen
else:
    print("Must be using Python 2 or 3")
    sys.exit(1)

from bs4 import BeautifulSoup

def find_url(args):
    """Finds the youtube video url for the requested song. The youtube
    query is constructed like this:

        "<song> <artist> <search terms>"

    so plugging in "Bohemian Rhapsody", "Queen", and "lyrics" would end
    up with a search for "Bohemian Rhapsody Queen lyrics" on youtube.

    :param args: A dictionary with the following possible arguments
    :param-required song: song name
    :param-required artist: artist name
    :param search_terms: any additional search terms you may want to
        add to the search query
    :param first: a boolean, if true, just returns the first youtube
        search result
    :param caught_by_google: a boolean, if not false or none, turns on
        the captcha catcher
    :param download_first: a boolean, if true, downloads the first video
        that youtube returns for the search query
    :rtype: A string of the youtube url for the song
    """
    args = args if type(args) == dict else {}

    song_title = args.get("song")
    artist_name = args.get("artist")
    search_terms = args.get("search_terms")
    first = args.get("first")
    caught_by_google = args.get("caught_by_google")
    download_first = args.get("download_first")
    total_search_tries = args.get("total_search_tries")

    query = artist_name + " " + song_title
    if search_terms:
        query += " " + search_terms

    encoded_query = urlencode({"search_query": query})
    url = "http://www.youtube.com/results?" + encoded_query

    soup = _get_url_data(url, caught_by_google)

    # if you want to inspect the html being requested
    # print(soup.prettify())
    # with open("index.html", "wb") as f:
    #     f.write(soup.prettify().encode('utf-8'))

    # Each of the tags in the results list has the following relevant
    # attributes:
    #   "title": the title of the youtube video
    #   "href": the youtube video code, namely the X's of
    #           https://www.youtube.com/watch?v=XXXXXXXXXXX
    #   "class": the classes of the link, used to identify the youtube title
    results = _find_links(soup)

    best_guess = None
    total_tries_counter = 0

    if len(results) <= 0:
        raise Exception('There were no search results for "{}"'.format(query))

    if first:
        return "https://youtube.com" + results[0]["href"]

    scores = []
    for index, link in enumerate(results):
        scores.append([
            index,
            _score_song(song_title, artist_name, link["title"]),
            link["href"]
        ])

    # sort by the score of the song, best match first
    scores.sort(key=lambda x: x[1], reverse=True)

    return "https://youtube.com" + results[scores[0][0]]["href"]

def _score_song(song_title, artist_name, video_title):
    """Scores the likelihood of the song audio being in the video based off of
    the video title.

    :param song_title: a string, the title of the song that you're looking for
    :param artist_name: a string, the name of the artist of the song
    :param video_title: a string, the title of the video you're analyzing
    :rtype: an integer, the score of the song
    """
    points = 0

    song_title = _simplify(song_title)
    artist_name = _simplify(artist_name)
    video_title = _simplify(video_title)

    if song_title in video_title:
        points += 3
    if artist_name in video_title:
        points += 3

    points -= _count_garbage_phrases(video_title, song_title)

    return points
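
# Illustrative scoring sketch (an assumption, not taken from the original
# file), using hypothetical video titles:
#
#   _score_song("Creep", "Radiohead", "Radiohead - Creep")
#   # -> 6 (+3 for the song title, +3 for the artist)
#
#   _score_song("Creep", "Radiohead", "Creep (live cover)")
#   # -> 1 (+3 for the song title, -2 for the garbage phrases "live", "cover")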

def _simplify(string):
    """Lowercases and strips all non-alphanumeric characters from the string

    :param string: a string to be modified
    :rtype: the modified string
    """
    return re.sub(r'[^a-zA-Z0-9]+', '', string).lower()
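
# For example (illustrative): _simplify("Bohemian Rhapsody!") returns
# "bohemianrhapsody", so the substring checks in _score_song above are
# case- and punctuation-insensitive.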

def _count_garbage_phrases(video_title, song_title):
    """Checks if there are any phrases in the title of the video that would
    indicate it doesn't have the audio we want

    :param video_title: a string, the youtube video title
    :param song_title: a string, the actual title of the song we're looking for
    :rtype: an integer, the number of bad phrases in the video title
    """
    # Garbage phrases found through experiences of downloading the wrong song
    # TODO: add this into the config so the user can mess with it if they want
    garbage_phrases = (
        "cover album live clean rare version full full album row at "
        "@ session how to npr music reimagined hr version"
    ).split(" ")

    bad_phrases = 0
    for gphrase in garbage_phrases:
        # make sure we're not invalidating part of the title of the song
        if gphrase in song_title.lower():
            continue
        # penalize the video if the garbage phrase appears in its title
        if gphrase in video_title.lower():
            bad_phrases += 1

    return bad_phrases
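
# Illustrative sketch (an assumption, not in the original file): called
# directly, a title like "song live at abbey road" for the song "song" picks
# up penalties for "live" and "at", while a song actually called
# "live forever" is never penalized for "live" because of the skip above.
#
#   _count_garbage_phrases("song live at abbey road", "song")              # -> 2
#   _count_garbage_phrases("live forever official video", "live forever")  # -> 0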

def _find_links(soup):
    """Finds youtube video links in the html soup

    :param soup: a BeautifulSoup(...) element
    :rtype: returns a list of valid youtube video links
    """
    return list(filter(None, map(_find_link, soup.find_all("a"))))

def _find_link(link):
    """Tests html tags to see if they contain a youtube video link.
    Should be used only with the _find_links function in a map func.

    :param link: accepts an element from BeautifulSoup(...).find_all(...)
    :rtype: returns the link if it's an actual video link, otherwise, None
    """
    try:
        class_to_check = str(" ".join(link["class"]))
    except KeyError:
        return

    # these classes are found by inspecting the html soup of a youtube search.
    valid_classes = [
        "yt-simple-endpoint style-scope ytd-video-renderer",
        ("yt-uix-tile-link yt-ui-ellipsis yt-ui-ellipsis-2 "
         "yt-uix-sessionlink spf-link ")
    ]

    try:
        # Make sure it's not a playlist
        if "&list=" in link["href"]:
            return
        for valid_class in valid_classes:
            if valid_class in class_to_check:
                return link
    except KeyError:
        pass
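
# Illustrative sketch (an assumption, not in the original file): feeding a
# hand-built anchor tag through the filter, assuming bs4 is installed.
#
#   tag = BeautifulSoup(
#       '<a class="yt-simple-endpoint style-scope ytd-video-renderer"'
#       ' href="/watch?v=XXXXXXXXXXX">some title</a>', "html.parser").a
#   _find_link(tag)   # -> the tag itself
#
#   nav = BeautifulSoup('<a href="/about">about</a>', "html.parser")
#   _find_links(nav)  # -> [] (no recognised video link classes)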

# TODO: build in the captcha cheater if the user is "caught" by google
def _get_url_data(url, caught_by_google):
    """Gets parsed html from the specified url

    :param url: A string, the url to request and parse.
    :param caught_by_google: A boolean, will open and use the captcha
        cheat to get around google's captcha.
    :rtype: A BeautifulSoup class
    """
    html_content = urlopen(url).read()
    return BeautifulSoup(html_content, 'html.parser')
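
# Minimal command-line sketch (an assumption, not part of the original
# commit): running the module directly performs one live search and prints
# the chosen url. The song/artist pair is purely illustrative.
if __name__ == "__main__":
    print(find_url({
        "song": "Bohemian Rhapsody",
        "artist": "Queen",
        "search_terms": "lyrics",
    }))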