Merge pull request #77 from cooperhammond/search-improvement

Search improvement based on song duration
2025-10-16 10:37:02 +00:00 · 2021-04-15 09:45:27 -06:00 · 2021-04-15 09:45:27 -06:00 · 92e8885ae9
parent 3f12a880e9 5eaac33345
commit 92e8885ae9
5 changed files with 273 additions and 200 deletions
--- a/.gitignore
+++ b/.gitignore
@ -12,3 +12,4 @@
 ffmpeg
 ffprobe
 youtube-dl
+*.temp
--- a/src/glue/mapper.cr
+++ b/src/glue/mapper.cr
@ -46,6 +46,7 @@ class TrackMapper
      type: Int32,
      setter: true
    },
+    duration_ms: Int32,
    type: String,
    uri: String
  )
--- a/src/glue/song.cr
+++ b/src/glue/song.cr
@ -85,6 +85,8 @@ class Song
    end

    data = @metadata.as(JSON::Any)
+    @song_name = data["name"].as_s
+    @artist_name = data["artists"][0]["name"].as_s
    @filename = "#{Pattern.parse(Config.filename_pattern, data)}.mp3"

    if ask_url
@ -97,7 +99,7 @@ class Song

    if !url
      outputter("url", 0)
-      url = Youtube.find_url(@song_name, @artist_name, search_terms: "lyrics")
+      url = Youtube.find_url(data, search_terms: "lyrics")
      if !url
        raise("There was no url found on youtube for " +
              %("#{@song_name}" by "#{@artist_name}. ) +
@ -119,29 +121,29 @@ class Song

    outputter("albumart", 0)
    temp_albumart_filename = ".tempalbumart.jpg"
-    HTTP::Client.get(data["album"]["images"][0]["url"].to_s) do |response|
+    HTTP::Client.get(data["album"]["images"][0]["url"].as_s) do |response|
      File.write(temp_albumart_filename, response.body_io)
    end
    outputter("albumart", 0)

    # check if song's metadata has been modded in playlist, update artist accordingly
    if data["artists"][-1]["owner"]? 
-      @artist = data["artists"][-1]["name"].to_s
+      @artist = data["artists"][-1]["name"].as_s
    else
-      @artist = data["artists"][0]["name"].to_s
+      @artist = data["artists"][0]["name"].as_s
    end
-    @album = data["album"]["name"].to_s
+    @album = data["album"]["name"].as_s

    tagger = Tags.new(@filename)
    tagger.add_album_art(temp_albumart_filename)
-    tagger.add_text_tag("title", data["name"].to_s)
+    tagger.add_text_tag("title", data["name"].as_s)
    tagger.add_text_tag("artist", @artist)

    if !@album.empty?
      tagger.add_text_tag("album", @album)
    end

-    if genre = @spotify_searcher.find_genre(data["artists"][0]["id"].to_s)
+    if genre = @spotify_searcher.find_genre(data["artists"][0]["id"].as_s)
      tagger.add_text_tag("genre", genre)
    end

--- a/src/search/ranking.cr
+++ b/src/search/ranking.cr
@ -0,0 +1,144 @@
+alias VID_VALUE_CLASS = String
+alias VID_METADATA_CLASS = Hash(String, VID_VALUE_CLASS)
+alias YT_METADATA_CLASS = Array(VID_METADATA_CLASS)
+
+module Ranker
+  extend self
+
+  GARBAGE_PHRASES = [
+    "cover", "album", "live", "clean", "version", "full", "full album", "row",
+    "at", "@", "session", "how to", "npr music", "reimagined", "version",
+    "trailer"
+  ]
+
+  GOLDEN_PHRASES = [
+    "official video", "official music video",
+  ]
+
+  # Will rank videos according to their title and the user input, returns a sorted array of hashes
+  # of the points a song was assigned and its original index
+  # *spotify_metadata* is the metadate (from spotify) of the song that you want
+  # *yt_metadata* is an array of hashes with metadata scraped from the youtube search result page
+  # *query* is the query that you submitted to youtube for the results you now have
+  # ```
+  # Ranker.rank_videos(spotify_metadata, yt_metadata, query)
+  # => [
+  #      {"points" => x, "index" => x},
+  #      ...
+  #    ]
+  # ```
+  # "index" corresponds to the original index of the song in yt_metadata
+  def rank_videos(spotify_metadata : JSON::Any, yt_metadata : YT_METADATA_CLASS,
+                  query : String) : Array(Hash(String, Int32))
+    points = [] of Hash(String, Int32)
+    index = 0
+
+    actual_song_name = spotify_metadata["name"].as_s
+    actual_artist_name = spotify_metadata["artists"][0]["name"].as_s
+
+    yt_metadata.each do |vid|
+      pts = 0
+
+      pts += points_string_compare(actual_song_name, vid["title"])
+      pts += points_string_compare(actual_artist_name, vid["title"])
+      pts += count_buzzphrases(query, vid["title"])
+      pts += compare_timestamps(spotify_metadata, vid)
+
+      points.push({
+        "points" => pts,
+        "index"  => index,
+      })
+      index += 1
+    end
+
+    # Sort first by points and then by original index of the song
+    points.sort! { |a, b|
+      if b["points"] == a["points"]
+        a["index"] <=> b["index"]
+      else
+        b["points"] <=> a["points"]
+      end
+    }
+
+    return points
+  end
+
+  # SINGULAR COMPONENT OF RANKING ALGORITHM
+  private def compare_timestamps(spotify_metadata : JSON::Any, node : VID_METADATA_CLASS) : Int32
+    # puts spotify_metadata.to_pretty_json()
+    actual_time = spotify_metadata["duration_ms"].as_i
+    vid_time = node["duration_ms"].to_i
+
+    difference = (actual_time - vid_time).abs 
+
+    # puts "actual: #{actual_time}, vid: #{vid_time}"
+    # puts "\tdiff: #{difference}"
+    # puts "\ttitle: #{node["title"]}"
+
+    if difference <= 1000
+      return 3
+    elsif difference <= 2000
+      return 2
+    elsif difference <= 5000
+      return 1
+    else 
+      return 0
+    end
+  end
+
+  # SINGULAR COMPONENT OF RANKING ALGORITHM
+  # Returns an `Int` based off the number of points worth assigning to the
+  # matchiness of the string. First the strings are downcased and then all
+  # nonalphanumeric characters are stripped.
+  # If *item1* includes *item2*, return 3 pts.
+  # If after the items have been blanked, *item1* includes *item2*,
+  #   return 1 pts.
+  # Else, return 0 pts.
+  private def points_string_compare(item1 : String, item2 : String) : Int32
+    if item2.includes?(item1)
+      return 3
+    end
+
+    item1 = item1.downcase.gsub(/[^a-z0-9]/, "")
+    item2 = item2.downcase.gsub(/[^a-z0-9]/, "")
+
+    if item2.includes?(item1)
+      return 1
+    else
+      return 0
+    end
+  end
+
+  # SINGULAR COMPONENT OF RANKING ALGORITHM
+  # Checks if there are any phrases in the title of the video that would
+  # indicate audio having what we want.
+  # *video_name* is the title of the video, and *query* is what the user the
+  # program searched for. *query* is needed in order to make sure we're not
+  # subtracting points from something that's naturally in the title
+  private def count_buzzphrases(query : String, video_name : String) : Int32
+    good_phrases = 0
+    bad_phrases = 0
+
+    GOLDEN_PHRASES.each do |gold_phrase|
+      gold_phrase = gold_phrase.downcase.gsub(/[^a-z0-9]/, "")
+
+      if query.downcase.gsub(/[^a-z0-9]/, "").includes?(gold_phrase)
+        next
+      elsif video_name.downcase.gsub(/[^a-z0-9]/, "").includes?(gold_phrase)
+        good_phrases += 1
+      end
+    end
+
+    GARBAGE_PHRASES.each do |garbage_phrase|
+      garbage_phrase = garbage_phrase.downcase.gsub(/[^a-z0-9]/, "")
+
+      if query.downcase.gsub(/[^a-z0-9]/, "").includes?(garbage_phrase)
+        next
+      elsif video_name.downcase.gsub(/[^a-z0-9]/, "").includes?(garbage_phrase)
+        bad_phrases += 1
+      end
+    end
+
+    return good_phrases - bad_phrases
+  end
+end
--- a/src/search/youtube.cr
+++ b/src/search/youtube.cr
@ -3,6 +3,8 @@ require "xml"
 require "json"
 require "uri"

+require "./ranking"
+

 module Youtube
  extend self
@ -12,17 +14,122 @@ module Youtube
    "yt-uix-tile-link yt-ui-ellipsis yt-ui-ellipsis-2 yt-uix-sessionlink      spf-link ",
  ]

-  GARBAGE_PHRASES = [
-    "cover", "album", "live", "clean", "version", "full", "full album", "row",
-    "at", "@", "session", "how to", "npr music", "reimagined", "hr version",
-    "trailer",
-  ]
+  # Note that VID_VALUE_CLASS, VID_METADATA_CLASS, and YT_METADATA_CLASS are found in ranking.cr

-  GOLDEN_PHRASES = [
-    "official video", "official music video",
-  ]
+  # Finds a youtube url based off of the given information.
+  # The query to youtube is constructed like this:
+  #   "<song_name> <artist_name> <search terms>"
+  # If *download_first* is provided, the first link found will be downloaded.
+  # If *select_link* is provided, a menu of options will be shown for the user to choose their poison
+  #
+  # ```
+  # Youtube.find_url("Bohemian Rhapsody", "Queen")
+  # => "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
+  # ```
+  def find_url(spotify_metadata : JSON::Any, search_terms = "",
+               download_first = false, select_link = false) : String?

-  alias NODES_CLASS = Array(Hash(String, String))
+    song_name = spotify_metadata["name"].as_s
+    artist_name = spotify_metadata["artists"][0]["name"].as_s
+
+    human_query = song_name + " " + artist_name + " " + search_terms.strip
+    url_query = human_query.gsub(" ", "+")
+
+    url = "https://www.youtube.com/results?search_query=" + url_query
+
+    response = HTTP::Client.get(url)
+
+    yt_metadata = get_yt_search_metadata(response.body)
+
+    if yt_metadata.size == 0
+      puts "There were no results for this query on youtube: \"#{human_query}\""
+      return nil
+    end
+
+    root = "https://youtube.com"
+
+    if download_first
+      return root + yt_metadata[0]["href"] 
+    end
+
+    if select_link
+      # return select_link_menu()
+    end
+
+    ranked = Ranker.rank_videos(spotify_metadata, yt_metadata, human_query)
+
+    begin
+      return root + yt_metadata[ranked[0]["index"]]["href"]
+    rescue IndexError
+      return nil
+    end
+
+    exit 1
+  end
+  
+  #
+  private def select_link_menu() : String
+
+  end
+
+  # Finds valid video links from a `HTTP::Client.get` request 
+  # Returns an `Array` of `NODES_CLASS` containing additional metadata from Youtube
+  private def get_yt_search_metadata(response_body : String) : YT_METADATA_CLASS
+    yt_initial_data : JSON::Any = JSON.parse("{}")
+
+    response_body.each_line do |line|
+      # timestamp 11/8/2020:
+      # youtube's html page has a line previous to this literally with 'scraper_data_begin' as a comment
+      if line.includes?("var ytInitialData")
+        # Extract JSON data from line
+        data = line.split(" = ")[2].delete(';')
+        dataEnd = (data.index("</script>") || 0) - 1
+
+        begin
+          yt_initial_data = JSON.parse(data[0..dataEnd])
+        rescue
+          break
+        end
+      end
+    end
+
+    if yt_initial_data == JSON.parse("{}")
+      puts "Youtube has changed the way it organizes its webpage, submit a bug"
+      puts "saying it has done so on https://github.com/cooperhammond/irs"
+      exit(1)
+    end
+
+    # where the vid metadata lives
+    yt_initial_data = yt_initial_data["contents"]["twoColumnSearchResultsRenderer"]["primaryContents"]["sectionListRenderer"]["contents"]
+
+    video_metadata = [] of VID_METADATA_CLASS
+
+    i = 0
+    while true
+      begin
+        # video title
+        raw_metadata = yt_initial_data[0]["itemSectionRenderer"]["contents"][i]["videoRenderer"]
+
+        metadata = {} of String => VID_VALUE_CLASS
+
+        metadata["title"] = raw_metadata["title"]["runs"][0]["text"].as_s
+        metadata["href"] = raw_metadata["navigationEndpoint"]["commandMetadata"]["webCommandMetadata"]["url"].as_s
+        timestamp = raw_metadata["lengthText"]["simpleText"].as_s
+        metadata["timestamp"] = timestamp
+        metadata["duration_ms"] = ((timestamp.split(":")[0].to_i * 60 +
+                               timestamp.split(":")[1].to_i) * 1000).to_s
+
+
+        video_metadata.push(metadata)
+      rescue IndexError
+        break
+      rescue Exception
+      end
+      i += 1
+    end
+
+    return video_metadata
+  end

  # Checks if the given URL is a valid youtube URL
  #
@ -62,186 +169,4 @@ module Youtube

    return response.body.includes?("status=ok")
  end
-
-  # Finds a youtube url based off of the given information.
-  # The query to youtube is constructed like this:
-  #   "<song_name> <artist_name> <search terms>"
-  # If *download_first* is provided, the first link found will be downloaded.
-  #
-  # ```
-  # Youtube.find_url("Bohemian Rhapsody", "Queen")
-  # => "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
-  # ```
-  def find_url(song_name : String, artist_name : String, search_terms = "",
-               download_first = false) : String?
-    query = (song_name + " " + artist_name + " " + search_terms).strip.gsub(" ", "+")
-
-    url = "https://www.youtube.com/results?search_query=" + query
-
-    response = HTTP::Client.get(url)
-
-    valid_nodes = get_video_link_nodes(response.body)
-
-    if valid_nodes.size == 0
-      puts "There were no results for that query."
-      return nil
-    end
-
-    root = "https://youtube.com"
-
-    return root + valid_nodes[0]["href"] if download_first
-
-    ranked = rank_videos(song_name, artist_name, query, valid_nodes)
-
-    begin
-      return root + valid_nodes[ranked[0]["index"]]["href"]
-    rescue IndexError
-      return nil
-    end
-  end
-
-  # Will rank videos according to their title and the user input
-  # Return:
-  # [
-  #   {"points" => x, "index" => x},
-  #   ...
-  # ]
-  private def rank_videos(song_name : String, artist_name : String,
-                          query : String, nodes : Array(Hash(String, String))) : Array(Hash(String, Int32))
-    points = [] of Hash(String, Int32)
-    index = 0
-
-    nodes.each do |node|
-      pts = 0
-
-      pts += points_compare(song_name, node["title"])
-      pts += points_compare(artist_name, node["title"])
-      pts += count_buzzphrases(query, node["title"])
-
-      points.push({
-        "points" => pts,
-        "index"  => index,
-      })
-      index += 1
-    end
-
-    # Sort first by points and then by original index of the song
-    points.sort! { |a, b|
-      if b["points"] == a["points"]
-        a["index"] <=> b["index"]
-      else
-        b["points"] <=> a["points"]
-      end
-    }
-
-    return points
-  end
-
-  # Returns an `Int` based off the number of points worth assigning to the
-  # matchiness of the string. First the strings are downcased and then all
-  # nonalphanumeric characters are stripped.
-  # If *item1* includes *item2*, return 3 pts.
-  # If after the items have been blanked, *item1* includes *item2*,
-  #   return 1 pts.
-  # Else, return 0 pts.
-  private def points_compare(item1 : String, item2 : String) : Int32
-    if item2.includes?(item1)
-      return 3
-    end
-
-    item1 = item1.downcase.gsub(/[^a-z0-9]/, "")
-    item2 = item2.downcase.gsub(/[^a-z0-9]/, "")
-
-    if item2.includes?(item1)
-      return 1
-    else
-      return 0
-    end
-  end
-
-  # Checks if there are any phrases in the title of the video that would
-  # indicate audio having what we want.
-  # *video_name* is the title of the video, and *query* is what the user the
-  # program searched for. *query* is needed in order to make sure we're not
-  # subtracting points from something that's naturally in the title
-  private def count_buzzphrases(query : String, video_name : String) : Int32
-    good_phrases = 0
-    bad_phrases = 0
-
-    GOLDEN_PHRASES.each do |gold_phrase|
-      gold_phrase = gold_phrase.downcase.gsub(/[^a-z0-9]/, "")
-
-      if query.downcase.gsub(/[^a-z0-9]/, "").includes?(gold_phrase)
-        next
-      elsif video_name.downcase.gsub(/[^a-z0-9]/, "").includes?(gold_phrase)
-        good_phrases += 1
-      end
-    end
-
-    GARBAGE_PHRASES.each do |garbage_phrase|
-      garbage_phrase = garbage_phrase.downcase.gsub(/[^a-z0-9]/, "")
-
-      if query.downcase.gsub(/[^a-z0-9]/, "").includes?(garbage_phrase)
-        next
-      elsif video_name.downcase.gsub(/[^a-z0-9]/, "").includes?(garbage_phrase)
-        bad_phrases += 1
-      end
-    end
-
-    return good_phrases - bad_phrases
-  end
-
-  # Finds valid video links from a `HTTP::Client.get` request
-  # Returns an `Array` of `XML::Node`
-  private def get_video_link_nodes(response_body : String) : NODES_CLASS
-    yt_initial_data : JSON::Any = JSON.parse("{}")
-
-    response_body.each_line do |line|
-      # timestamp 11/8/2020:
-      # youtube's html page has a line previous to this literally with 'scraper_data_begin' as a comment
-      if line.includes?("var ytInitialData")
-        # Extract JSON data from line
-        data = line.split(" = ")[2].delete(';')
-        dataEnd = (data.index("</script>") || 0) - 1
-
-        begin
-          yt_initial_data = JSON.parse(data[0..dataEnd])
-        rescue
-          break
-        end
-      end
-    end
-
-    if yt_initial_data == JSON.parse("{}")
-      puts "Youtube has changed the way it organizes its webpage, submit a bug"
-      puts "saying it has done so on https://github.com/cooperhammond/irs"
-      exit(1)
-    end
-
-    # where the vid metadata lives
-    yt_initial_data = yt_initial_data["contents"]["twoColumnSearchResultsRenderer"]["primaryContents"]["sectionListRenderer"]["contents"]
-
-    video_metadata = [] of Hash(String, String)
-
-    i = 0
-    while true
-      begin
-        # video title
-        raw_metadata = yt_initial_data[0]["itemSectionRenderer"]["contents"][i]["videoRenderer"]
-
-        metadata = {} of String => String
-
-        metadata["title"] = raw_metadata["title"]["runs"][0]["text"].as_s
-        metadata["href"] = raw_metadata["navigationEndpoint"]["commandMetadata"]["webCommandMetadata"]["url"].as_s
-    
-        video_metadata.push(metadata)
-      rescue IndexError
-        break
-      rescue Exception
-      end
-      i += 1
-    end
-
-    return video_metadata
-  end
 end