fixed youtube search ranking algo

2025-10-20 11:17:17 +00:00 · 2019-06-14 08:22:38 -07:00 · 2019-06-14 08:22:38 -07:00 · f82affb589
parent 253efd1e11
commit f82affb589
2 changed files with 64 additions and 8 deletions
--- a/src/interact/ripper.cr
+++ b/src/interact/ripper.cr
@ -17,6 +17,8 @@ module Ripper
    # remove the extension that will be added on by ydl
    output_filename = output_filename.split(".")[..-2].join(".")

+    # TODO: update the logger for this. Explore overwriting stdout and 
+    # injecting/removing text
    options = {
      "--output" => %("#{output_filename}.%(ext)s"), # auto-add correct ext
      # "--quiet" => "",
--- a/src/search/youtube.cr
+++ b/src/search/youtube.cr
@ -11,6 +11,16 @@ module Youtube
    "yt-uix-tile-link yt-ui-ellipsis yt-ui-ellipsis-2 yt-uix-sessionlink      spf-link "
  ]

+  GARBAGE_PHRASES = [
+    "cover", "album", "live", "clean", "version", "full", "full album", "row",
+    "at", "@", "session", "how to", "npr music", "reimagined", "hr version",
+    "trailer"
+  ]
+
+  GOLDEN_PHRASES = [
+    "official video", "official music video"
+  ]
+
  # Finds a youtube url based off of the given information.
  # The query to youtube is constructed like this:
  #   "<song_name> <artist_name> <search terms>"
@ -38,16 +48,16 @@ module Youtube

    return root + valid_nodes[0]["href"] if download_first

-    ranked = __rank_videos(song_name, artist_name, valid_nodes)
+    ranked = __rank_videos(song_name, artist_name, query, valid_nodes)

-    return root + valid_nodes[ranked[0][1]]["href"]
+    return root + valid_nodes[ranked[0]["index"]]["href"]
  end

  # Will rank videos according to their title and the user input
  # Returns an `Array` of Arrays each layed out like 
  # [<points>, <original index>].
-  private def __rank_videos(song_name, artist_name, nodes : Array(XML::Node))
-    points = [] of Array(Int32)
+  private def __rank_videos(song_name : String, artist_name : String, query : String, nodes : Array(XML::Node))
+    points = [] of Hash(String, Int32)
    index = 0

    nodes.each do |node|
@ -55,12 +65,24 @@ module Youtube

      pts += __points_compare(song_name, node["title"])
      pts += __points_compare(artist_name, node["title"])
+      pts += __count_buzzphrases(query, node["title"])

-      points.push([pts, index])
+      points.push({
+        "points" => pts,
+        "index" => index
+      })
      index += 1
+
    end

-    points.sort!{ |a, b| b[0] <=> a[0] }
+    # Sort first by points and then by original index of the song
+    points.sort!{ |a, b| 
+      if b["points"] == a["points"]
+        a["index"] <=> b["index"]
+      else
+        b["points"] <=> a["points"]
+      end
+    }

    return points
  end
@ -73,20 +95,52 @@ module Youtube
  #   return 1 pts.
  # Else, return 0 pts.
  private def __points_compare(item1 : String, item2 : String)
-    if item1.includes?(item2)
+    if item2.includes?(item1)
      return 3
    end

    item1 = item1.downcase.gsub(/[^a-z0-9]/, "")
    item2 = item2.downcase.gsub(/[^a-z0-9]/, "")

-    if item1.includes?(item2)
+    if item2.includes?(item1)
      return 1
    else
      return 0
    end
  end

+  # Checks if there are any phrases in the title of the video that would 
+  # indicate audio having what we want.
+  # *video_name* is the title of the video, and *query* is what the user the
+  # program searched for. *query* is needed in order to make sure we're not 
+  # subtracting points from something that's naturally in the title
+  private def __count_buzzphrases(query : String, video_name : String)
+    good_phrases = 0
+    bad_phrases = 0
+
+    GOLDEN_PHRASES.each do |gold_phrase|
+      gold_phrase = gold_phrase.downcase.gsub(/[^a-z0-9]/, "")
+  
+      if query.downcase.gsub(/[^a-z0-9]/, "").includes?(gold_phrase)
+        next
+      elsif video_name.downcase.gsub(/[^a-z0-9]/, "").includes?(gold_phrase)
+        bad_phrases += 1
+      end
+    end
+
+    GARBAGE_PHRASES.each do |garbage_phrase|
+      garbage_phrase = garbage_phrase.downcase.gsub(/[^a-z0-9]/, "")
+
+      if query.downcase.gsub(/[^a-z0-9]/, "").includes?(garbage_phrase)
+        next
+      elsif video_name.downcase.gsub(/[^a-z0-9]/, "").includes?(garbage_phrase)
+        bad_phrases += 1
+      end
+    end
+
+    return good_phrases - bad_phrases
+  end
+
  # Finds valid video links from a `HTTP::Client.get` request
  # Returns an `Array` of `XML::Node`
  private def __get_video_link_nodes(doc : String)