fixed youtube search ranking algo

2025-08-25 01:01:00 +00:00 · 2019-06-14 08:22:38 -07:00 · 2019-06-14 08:22:38 -07:00 · efff04947c
parent ba7400819b
commit efff04947c
2 changed files with 64 additions and 8 deletions
--- a/src/interact/ripper.cr
+++ b/src/interact/ripper.cr
@ -17,6 +17,8 @@ module Ripper
    # remove the extension that will be added on by ydl
    output_filename = output_filename.split(".")[..-2].join(".")
    # TODO: update the logger for this. Explore overwriting stdout and 
    # injecting/removing text
    options = {
      "--output" => %("#{output_filename}.%(ext)s"), # auto-add correct ext
      # "--quiet" => "",
--- a/src/search/youtube.cr
+++ b/src/search/youtube.cr
@ -11,6 +11,16 @@ module Youtube
    "yt-uix-tile-link yt-ui-ellipsis yt-ui-ellipsis-2 yt-uix-sessionlink      spf-link "
  ]
  GARBAGE_PHRASES = [
    "cover", "album", "live", "clean", "version", "full", "full album", "row",
    "at", "@", "session", "how to", "npr music", "reimagined", "hr version",
    "trailer"
  ]
  GOLDEN_PHRASES = [
    "official video", "official music video"
  ]
  # Finds a youtube url based off of the given information.
  # The query to youtube is constructed like this:
  #   "<song_name> <artist_name> <search terms>"
@ -38,16 +48,16 @@ module Youtube
    return root + valid_nodes[0]["href"] if download_first
-    ranked = __rank_videos(song_name, artist_name, valid_nodes)
+    ranked = __rank_videos(song_name, artist_name, query, valid_nodes)
-    return root + valid_nodes[ranked[0][1]]["href"]
+    return root + valid_nodes[ranked[0]["index"]]["href"]
  end
  # Will rank videos according to their title and the user input
  # Returns an `Array` of Arrays each layed out like 
  # [<points>, <original index>].
-  private def __rank_videos(song_name, artist_name, nodes : Array(XML::Node))
+  private def __rank_videos(song_name : String, artist_name : String, query : String, nodes : Array(XML::Node))
-    points = [] of Array(Int32)
+    points = [] of Hash(String, Int32)
    index = 0
    nodes.each do |node|
@ -55,12 +65,24 @@ module Youtube
      pts += __points_compare(song_name, node["title"])
      pts += __points_compare(artist_name, node["title"])
      pts += __count_buzzphrases(query, node["title"])
-      points.push([pts, index])
+      points.push({
        "points" => pts,
        "index" => index
      })
      index += 1
    end
-    points.sort!{ |a, b| b[0] <=> a[0] }
+    # Sort first by points and then by original index of the song
    points.sort!{ |a, b| 
      if b["points"] == a["points"]
        a["index"] <=> b["index"]
      else
        b["points"] <=> a["points"]
      end
    }
    return points
  end
@ -73,20 +95,52 @@ module Youtube
  #   return 1 pts.
  # Else, return 0 pts.
  private def __points_compare(item1 : String, item2 : String)
-    if item1.includes?(item2)
+    if item2.includes?(item1)
      return 3
    end
    item1 = item1.downcase.gsub(/[^a-z0-9]/, "")
    item2 = item2.downcase.gsub(/[^a-z0-9]/, "")
-    if item1.includes?(item2)
+    if item2.includes?(item1)
      return 1
    else
      return 0
    end
  end
  # Checks if there are any phrases in the title of the video that would 
  # indicate audio having what we want.
  # *video_name* is the title of the video, and *query* is what the user the
  # program searched for. *query* is needed in order to make sure we're not 
  # subtracting points from something that's naturally in the title
  private def __count_buzzphrases(query : String, video_name : String)
    good_phrases = 0
    bad_phrases = 0
    GOLDEN_PHRASES.each do |gold_phrase|
      gold_phrase = gold_phrase.downcase.gsub(/[^a-z0-9]/, "")
      if query.downcase.gsub(/[^a-z0-9]/, "").includes?(gold_phrase)
        next
      elsif video_name.downcase.gsub(/[^a-z0-9]/, "").includes?(gold_phrase)
        bad_phrases += 1
      end
    end
    GARBAGE_PHRASES.each do |garbage_phrase|
      garbage_phrase = garbage_phrase.downcase.gsub(/[^a-z0-9]/, "")
      if query.downcase.gsub(/[^a-z0-9]/, "").includes?(garbage_phrase)
        next
      elsif video_name.downcase.gsub(/[^a-z0-9]/, "").includes?(garbage_phrase)
        bad_phrases += 1
      end
    end
    return good_phrases - bad_phrases
  end
  # Finds valid video links from a `HTTP::Client.get` request
  # Returns an `Array` of `XML::Node`
  private def __get_video_link_nodes(doc : String)