Fixed to be compatible for 2 types of youtubes HTML/CSS code

2025-11-05 18:24:47 +00:00 · 2017-09-05 12:22:56 -07:00 · 2017-09-05 12:22:56 -07:00 · a4915cf85d
parent 13c71da9c4
commit a4915cf85d
2 changed files with 48 additions and 10 deletions
--- a/irs/ripper.py
+++ b/irs/ripper.py
@ -18,7 +18,7 @@ from spotipy.oauth2 import SpotifyClientCredentials


 # Local utilities
-from .utils import YdlUtils, ObjManip, Config
+from .utils import YdlUtils, ObjManip, Config, CaptchaCheat
 from .metadata import Metadata
 from .metadata import find_album_and_track, parse_genre

@ -58,6 +58,9 @@ class Ripper:
            client_credentials_manager = SpotifyClientCredentials(CLIENT_ID,
                                                                  CLIENT_SECRET
                                                                  # Stupid lint
+                                                                  # and stupid
+                                                                  # long var
+                                                                  # names
                                                                  )

            self.spotify = spotipy.Spotify(
@ -130,7 +133,7 @@ class Ripper:

        return locations

-    def find_yt_url(self, song=None, artist=None, additional_search=None):
+    def find_yt_url(self, song=None, artist=None, additional_search=None, caught_by_google=False):
        if additional_search is None:
            additional_search = Config.parse_search_terms(self)
            print(str(self.args["hook-text"].get("youtube")))
@ -150,19 +153,24 @@ init, or in method arguments.")
                                 search_terms.encode('utf-8'))})
        link = "http://www.youtube.com/results?" + query_string

-        html_content = urlopen(link).read()
-        soup = BeautifulSoup(html_content, 'html.parser')  # .prettify()
+        if not caught_by_google:
+            html_content = urlopen(link).read()
+            soup = BeautifulSoup(html_content, 'html.parser')
+        else:
+            soup = BeautifulSoup(CaptchaCheat.cheat_it(link), 'html.parser')
+
+        # print(soup.prettify())

        def find_link(link):
            try:
-                if "yt-uix-tile-link yt-ui-ellipsis yt-ui-ellipsis-2 yt-uix-\
-sessionlink spf-link" in str(" ".join(link["class"])):
+                if "yt-simple-endpoint style-scope ytd-video-renderer" in str(" ".join(link["class"])) or \
+                   "yt-uix-tile-link yt-ui-ellipsis yt-ui-ellipsis-2 yt-uix-sessionlink spf-link" in str(" ".join(link["class"])):
                    if "&list=" not in link["href"]:
                        return link
            except KeyError:
                pass

-        results = list(filter(None, (map(find_link, soup.find_all("a")))))
+        results = list(filter(None, map(find_link, soup.find_all("a"))))

        garbage_phrases = "cover  album  live  clean  rare version  full  full \
 album".split("  ")
@ -198,10 +206,13 @@ album".split("  ")
        if self.code is None:
            if additional_search == "lyrics":
                return self.find_yt_url(song, artist, "")
-            else:
-                self.code = results[0]

-        return ("https://youtube.com" + self.code["href"], self.code["title"])
+        try:
+            return ("https://youtube.com" + self.code["href"], self.code["title"])
+        except TypeError:
+            # Assuming Google catches you trying to search youtube for music ;)
+            return self.find_yt_url(song=song, artist=artist, additional_search=additional_search, caught_by_google=True)
+

    def album(self, title, artist=None):  # Alias for spotify_list("album", ..)
        return self.spotify_list("album", title=title, artist=artist)
--- a/irs/utils.py
+++ b/irs/utils.py
@ -442,3 +442,30 @@ class Config:
                                 where="post_processors")
        else:
            return True
+
+
+#==============
+# Captcha Cheat
+#==============
+# I basically consider myself a genius for this snippet.
+
+from splinter import Browser
+from time import sleep
+
+
+@staticmethods
+class CaptchaCheat:
+    def cheat_it(url, t=1):
+        executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
+        with Browser('chrome', **executable_path) as b:
+            b.visit(url)
+            sleep(t)
+            while CaptchaCheat.strip_it(b.evaluate_script("document.URL")) != CaptchaCheat.strip_it(url):
+                sleep(t)
+            return b.evaluate_script("document.getElementsByTagName('html')[0].innerHTML")
+
+    def strip_it(s):
+        s = s.encode("utf-8")
+        s = s.strip("http://")
+        s = s.strip("https://")
+        return s