diff --git a/.gitignore b/.gitignore index 463117c..07d536c 100755 --- a/.gitignore +++ b/.gitignore @@ -11,4 +11,5 @@ .ripper.log ffmpeg ffprobe -youtube-dl \ No newline at end of file +youtube-dl +*.temp \ No newline at end of file diff --git a/src/glue/song.cr b/src/glue/song.cr index 0de193f..07c5697 100755 --- a/src/glue/song.cr +++ b/src/glue/song.cr @@ -85,6 +85,8 @@ class Song end data = @metadata.as(JSON::Any) + @song_name = data["name"].as_s + @artist_name = data["artists"][0]["name"].as_s @filename = "#{Pattern.parse(Config.filename_pattern, data)}.mp3" if ask_url @@ -97,7 +99,7 @@ class Song if !url outputter("url", 0) - url = Youtube.find_url(@song_name, @artist_name, search_terms: "lyrics") + url = Youtube.find_url(data, search_terms: "lyrics") if !url raise("There was no url found on youtube for " + %("#{@song_name}" by "#{@artist_name}. ) + @@ -119,29 +121,29 @@ class Song outputter("albumart", 0) temp_albumart_filename = ".tempalbumart.jpg" - HTTP::Client.get(data["album"]["images"][0]["url"].to_s) do |response| + HTTP::Client.get(data["album"]["images"][0]["url"].as_s) do |response| File.write(temp_albumart_filename, response.body_io) end outputter("albumart", 0) # check if song's metadata has been modded in playlist, update artist accordingly if data["artists"][-1]["owner"]? - @artist = data["artists"][-1]["name"].to_s + @artist = data["artists"][-1]["name"].as_s else - @artist = data["artists"][0]["name"].to_s + @artist = data["artists"][0]["name"].as_s end - @album = data["album"]["name"].to_s + @album = data["album"]["name"].as_s tagger = Tags.new(@filename) tagger.add_album_art(temp_albumart_filename) - tagger.add_text_tag("title", data["name"].to_s) + tagger.add_text_tag("title", data["name"].as_s) tagger.add_text_tag("artist", @artist) if !@album.empty? 
tagger.add_text_tag("album", @album) end - if genre = @spotify_searcher.find_genre(data["artists"][0]["id"].to_s) + if genre = @spotify_searcher.find_genre(data["artists"][0]["id"].as_s) tagger.add_text_tag("genre", genre) end
diff --git a/src/search/ranking.cr b/src/search/ranking.cr
new file mode 100644
index 0000000..4f0909c
--- /dev/null
+++ b/src/search/ranking.cr
@@ -0,0 +1,131 @@
+alias VID_VALUE_CLASS = String
+alias VID_METADATA_CLASS = Hash(String, VID_VALUE_CLASS)
+alias YT_METADATA_CLASS = Array(VID_METADATA_CLASS)
+
+# Scores youtube search results against the spotify metadata of the wanted song
+module Ranker
+  extend self
+
+  # Each phrase costs a video one point when it is in the title but not in the query
+  GARBAGE_PHRASES = [
+    "cover", "album", "live", "clean", "version", "full", "full album", "row",
+    "at", "@", "session", "how to", "npr music", "reimagined", "trailer",
+  ]
+
+  # Each phrase earns a video one point when it is in the title but not in the query
+  GOLDEN_PHRASES = [
+    "official video", "official music video",
+  ]
+
+  # Will rank videos according to their title and the user input, returns a sorted array of hashes
+  # of the points a song was assigned and its original index
+  # *spotify_metadata* is the metadata (from spotify) of the song that you want
+  # *yt_metadata* is an array of hashes with metadata scraped from the youtube search result page
+  # *query* is the query that you submitted to youtube for the results you now have
+  # ```
+  # Ranker.rank_videos(spotify_metadata, yt_metadata, query)
+  # => [
+  #   {"points" => x, "index" => x},
+  #   ...
+  # ]
+  # ```
+  # "index" corresponds to the original index of the song in yt_metadata
+  def rank_videos(spotify_metadata : JSON::Any, yt_metadata : YT_METADATA_CLASS,
+                  query : String) : Array(Hash(String, Int32))
+    actual_song_name = spotify_metadata["name"].as_s
+    actual_artist_name = spotify_metadata["artists"][0]["name"].as_s
+
+    points = yt_metadata.map_with_index do |vid, index|
+      title = vid["title"]
+
+      pts = points_string_compare(actual_song_name, title)
+      pts += points_string_compare(actual_artist_name, title)
+      pts += count_buzzphrases(query, title)
+      pts += compare_timestamps(spotify_metadata, vid)
+
+      {"points" => pts, "index" => index}
+    end
+
+    # Best score first; ties keep the order youtube returned them in
+    points.sort_by! { |entry| {-entry["points"], entry["index"]} }
+
+    points
+  end
+
+  # SINGULAR COMPONENT OF RANKING ALGORITHM
+  # Compares the length of the song on spotify with the length of the video.
+  # Returns 3 pts when they differ by at most 1 second, 2 pts for at most
+  # 2 seconds, 1 pt for at most 5 seconds, and 0 pts otherwise.
+  # A missing or non-numeric duration on either side is worth 0 pts as well.
+  private def compare_timestamps(spotify_metadata : JSON::Any, node : VID_METADATA_CLASS) : Int32
+    actual_time = spotify_metadata["duration_ms"]?.try(&.as_i?)
+    return 0 unless actual_time
+
+    vid_time = node["duration_ms"]?.try(&.to_i?)
+    return 0 unless vid_time
+
+    difference = (actual_time - vid_time).abs
+
+    if difference <= 1000
+      3
+    elsif difference <= 2000
+      2
+    elsif difference <= 5000
+      1
+    else
+      0
+    end
+  end
+
+  # SINGULAR COMPONENT OF RANKING ALGORITHM
+  # Returns an `Int` based off the number of points worth assigning to the
+  # matchiness of the string.
+  # If *item2* includes *item1* exactly as given, return 3 pts.
+  # If it only does once both have been downcased and stripped of all
+  # nonalphanumeric characters, return 1 pts.
+  # Else, return 0 pts.
+  private def points_string_compare(item1 : String, item2 : String) : Int32
+    return 3 if item2.includes?(item1)
+    return 1 if normalize(item2).includes?(normalize(item1))
+
+    0
+  end
+
+  # SINGULAR COMPONENT OF RANKING ALGORITHM
+  # Checks if there are any phrases in the title of the video that would
+  # indicate audio having what we want.
+  # *video_name* is the title of the video, and *query* is what the
+  # program searched for. *query* is needed in order to make sure we're not
+  # subtracting points from something that's naturally in the title
+  # Returns the number of golden phrases found minus the number of garbage
+  # phrases found.
+  private def count_buzzphrases(query : String, video_name : String) : Int32
+    # Blank both strings once instead of once per phrase
+    blank_query = normalize(query)
+    blank_title = normalize(video_name)
+
+    good_phrases = count_phrases(GOLDEN_PHRASES, blank_query, blank_title)
+    bad_phrases = count_phrases(GARBAGE_PHRASES, blank_query, blank_title)
+
+    good_phrases - bad_phrases
+  end
+
+  # Counts the *phrases* found in *blank_title* but not in *blank_query*.
+  # Both strings must already have been through `normalize`.
+  private def count_phrases(phrases : Array(String), blank_query : String,
+                            blank_title : String) : Int32
+    phrases.count do |phrase|
+      blank_phrase = normalize(phrase)
+
+      # "@" blanks to "", which every string includes, so it can never match
+      next false if blank_phrase.empty?
+
+      blank_title.includes?(blank_phrase) && !blank_query.includes?(blank_phrase)
+    end
+  end
+
+  # Downcases *text* and strips every character that is not a-z or 0-9
+  private def normalize(text : String) : String
+    text.downcase.gsub(/[^a-z0-9]/, "")
+  end
+end
\ No newline at end of file
diff --git a/src/search/youtube.cr b/src/search/youtube.cr
index d8cc455..054af55 100755
--- a/src/search/youtube.cr
+++ b/src/search/youtube.cr
@@ -3,6 +3,8 @@ require "xml"
 require "json"
 require "uri"
 
+require "./ranking"
+
 module Youtube
   extend self
 
@@ -12,17 +14,127 @@ module Youtube
     "yt-uix-tile-link yt-ui-ellipsis yt-ui-ellipsis-2 yt-uix-sessionlink spf-link ",
   ]
 
-  GARBAGE_PHRASES = [
-    "cover", "album", "live", "clean", "version", "full", "full album", "row",
-    "at", "@", "session", "how to", "npr music", "reimagined", "hr version",
-    "trailer",
-  ]
+  # Note that VID_VALUE_CLASS, VID_METADATA_CLASS, and YT_METADATA_CLASS are found in ranking.cr
 
-  GOLDEN_PHRASES = [
-    "official video", "official music video",
-  ]
+  # Finds a youtube url based off of the given information.
+  # The query to youtube is constructed like this:
+  #   "<song_name> <artist_name> <search_terms>"
+  # where both names are read out of *spotify_metadata*.
+  # If *download_first* is provided, the first link found is returned unranked.
+  # If *select_link* is provided, a menu of options should be shown for the user to
+  # choose their poison. That menu is not implemented yet, so the flag is ignored.
+  # Returns `nil` when youtube has no results for the query.
+  #
+  # ```
+  # Youtube.find_url(spotify_metadata, search_terms: "lyrics")
+  # => "https://youtube.com/watch?v=dQw4w9WgXcQ"
+  # ```
+  def find_url(spotify_metadata : JSON::Any, search_terms = "",
+               download_first = false, select_link = false) : String?
+    song_name = spotify_metadata["name"].as_s
+    artist_name = spotify_metadata["artists"][0]["name"].as_s
 
-  alias NODES_CLASS = Array(Hash(String, String))
+    # Strip the assembled query so empty *search_terms* leave no trailing space
+    human_query = "#{song_name} #{artist_name} #{search_terms.strip}".strip
+    # Form-encode it so characters such as '&' or '#' cannot break the url
+    url_query = URI.encode_www_form(human_query)
+
+    url = "https://www.youtube.com/results?search_query=" + url_query
+    response = HTTP::Client.get(url)
+    yt_metadata = get_yt_search_metadata(response.body)
+
+    if yt_metadata.empty?
+      puts "There were no results for this query on youtube: \"#{human_query}\""
+      return nil
+    end
+
+    root = "https://youtube.com"
+
+    return root + yt_metadata[0]["href"] if download_first
+
+    # TODO: when *select_link* is set, return select_link_menu() here
+
+    ranked = Ranker.rank_videos(spotify_metadata, yt_metadata, human_query)
+    best = ranked.first?
+    return nil unless best
+
+    root + yt_metadata[best["index"]]["href"]
+  end
+
+  # Placeholder for the menu that will let the user choose one of the results
+  private def select_link_menu : String
+    raise NotImplementedError.new("select_link_menu")
+  end
+
+  # Finds valid video links from a `HTTP::Client.get` request
+  # Returns a `YT_METADATA_CLASS`: one hash per video with the keys "title",
+  # "href", "timestamp" and "duration_ms"
+  # Prints a bug report hint and exits when the page layout is not recognized
+  private def get_yt_search_metadata(response_body : String) : YT_METADATA_CLASS
+    yt_initial_data : JSON::Any? = nil
+
+    response_body.each_line do |line|
+      # timestamp 11/8/2020:
+      # youtube's html page has a line previous to this literally with 'scraper_data_begin' as a comment
+      next unless line.includes?("var ytInitialData")
+
+      # Extract JSON data from line; a line without a third " = " piece is skipped
+      data = line.split(" = ")[2]?.try(&.delete(';'))
+      next unless data
+
+      # NOTE(review): the end marker is an empty string in this copy of the
+      # patch; "</script>" is presumed. When it is absent the whole piece is parsed.
+      data_end = (data.index("</script>") || 0) - 1
+
+      begin
+        yt_initial_data = JSON.parse(data[0..data_end])
+      rescue
+        break
+      end
+    end
+
+    # where the vid metadata lives
+    sections = yt_initial_data.try do |root|
+      root.dig?("contents", "twoColumnSearchResultsRenderer", "primaryContents",
+        "sectionListRenderer", "contents")
+    end
+
+    unless sections
+      puts "Youtube has changed the way it organizes its webpage, submit a bug"
+      puts "saying it has done so on https://github.com/cooperhammond/irs"
+      exit(1)
+    end
+
+    results = sections.dig?(0, "itemSectionRenderer", "contents").try(&.as_a?) || [] of JSON::Any
+    video_metadata = [] of VID_METADATA_CLASS
+
+    # Walk the results themselves instead of probing indexes until an
+    # IndexError, so one odd entry can neither end the scan early nor loop forever
+    results.each do |result|
+      begin
+        raw_metadata = result["videoRenderer"]
+        timestamp = raw_metadata["lengthText"]["simpleText"].as_s
+
+        metadata = {} of String => VID_VALUE_CLASS
+        metadata["title"] = raw_metadata["title"]["runs"][0]["text"].as_s
+        metadata["href"] = raw_metadata["navigationEndpoint"]["commandMetadata"]["webCommandMetadata"]["url"].as_s
+        metadata["timestamp"] = timestamp
+        metadata["duration_ms"] = timestamp_to_ms(timestamp).to_s
+
+        video_metadata.push(metadata)
+      rescue
+        # Not a plain video (no "videoRenderer" or no length), skip it
+      end
+    end
+
+    video_metadata
+  end
+
+  # Converts a "M:SS" or "H:MM:SS" *timestamp* into milliseconds
+  private def timestamp_to_ms(timestamp : String) : Int32
+    seconds = timestamp.split(":").reduce(0) { |total, part| total * 60 + part.to_i }
+    seconds * 1000
+  end
 
   # Checks if the given URL is a valid youtube URL
   #
@@ -62,186 +174,4 @@ module Youtube
return response.body.includes?("status=ok") end - - # Finds a youtube url based off of the given information. - # The query to youtube is constructed like this: - # " " - # If *download_first* is provided, the first link found will be downloaded.
- # - # ``` - # Youtube.find_url("Bohemian Rhapsody", "Queen") - # => "https://www.youtube.com/watch?v=dQw4w9WgXcQ" - # ``` - def find_url(song_name : String, artist_name : String, search_terms = "", - download_first = false) : String? - query = (song_name + " " + artist_name + " " + search_terms).strip.gsub(" ", "+") - - url = "https://www.youtube.com/results?search_query=" + query - - response = HTTP::Client.get(url) - - valid_nodes = get_video_link_nodes(response.body) - - if valid_nodes.size == 0 - puts "There were no results for that query." - return nil - end - - root = "https://youtube.com" - - return root + valid_nodes[0]["href"] if download_first - - ranked = rank_videos(song_name, artist_name, query, valid_nodes) - - begin - return root + valid_nodes[ranked[0]["index"]]["href"] - rescue IndexError - return nil - end - end - - # Will rank videos according to their title and the user input - # Return: - # [ - # {"points" => x, "index" => x}, - # ... - # ] - private def rank_videos(song_name : String, artist_name : String, - query : String, nodes : Array(Hash(String, String))) : Array(Hash(String, Int32)) - points = [] of Hash(String, Int32) - index = 0 - - nodes.each do |node| - pts = 0 - - pts += points_compare(song_name, node["title"]) - pts += points_compare(artist_name, node["title"]) - pts += count_buzzphrases(query, node["title"]) - - points.push({ - "points" => pts, - "index" => index, - }) - index += 1 - end - - # Sort first by points and then by original index of the song - points.sort! { |a, b| - if b["points"] == a["points"] - a["index"] <=> b["index"] - else - b["points"] <=> a["points"] - end - } - - return points - end - - # Returns an `Int` based off the number of points worth assigning to the - # matchiness of the string. First the strings are downcased and then all - # nonalphanumeric characters are stripped. - # If *item1* includes *item2*, return 3 pts. - # If after the items have been blanked, *item1* includes *item2*, - # return 1 pts. 
- # Else, return 0 pts. - private def points_compare(item1 : String, item2 : String) : Int32 - if item2.includes?(item1) - return 3 - end - - item1 = item1.downcase.gsub(/[^a-z0-9]/, "") - item2 = item2.downcase.gsub(/[^a-z0-9]/, "") - - if item2.includes?(item1) - return 1 - else - return 0 - end - end - - # Checks if there are any phrases in the title of the video that would - # indicate audio having what we want. - # *video_name* is the title of the video, and *query* is what the user the - # program searched for. *query* is needed in order to make sure we're not - # subtracting points from something that's naturally in the title - private def count_buzzphrases(query : String, video_name : String) : Int32 - good_phrases = 0 - bad_phrases = 0 - - GOLDEN_PHRASES.each do |gold_phrase| - gold_phrase = gold_phrase.downcase.gsub(/[^a-z0-9]/, "") - - if query.downcase.gsub(/[^a-z0-9]/, "").includes?(gold_phrase) - next - elsif video_name.downcase.gsub(/[^a-z0-9]/, "").includes?(gold_phrase) - good_phrases += 1 - end - end - - GARBAGE_PHRASES.each do |garbage_phrase| - garbage_phrase = garbage_phrase.downcase.gsub(/[^a-z0-9]/, "") - - if query.downcase.gsub(/[^a-z0-9]/, "").includes?(garbage_phrase) - next - elsif video_name.downcase.gsub(/[^a-z0-9]/, "").includes?(garbage_phrase) - bad_phrases += 1 - end - end - - return good_phrases - bad_phrases - end - - # Finds valid video links from a `HTTP::Client.get` request - # Returns an `Array` of `XML::Node` - private def get_video_link_nodes(response_body : String) : NODES_CLASS - yt_initial_data : JSON::Any = JSON.parse("{}") - - response_body.each_line do |line| - # timestamp 11/8/2020: - # youtube's html page has a line previous to this literally with 'scraper_data_begin' as a comment - if line.includes?("var ytInitialData") - # Extract JSON data from line - data = line.split(" = ")[2].delete(';') - dataEnd = (data.index("") || 0) - 1 - - begin - yt_initial_data = JSON.parse(data[0..dataEnd]) - rescue - break - end - 
end - end - - if yt_initial_data == JSON.parse("{}") - puts "Youtube has changed the way it organizes its webpage, submit a bug" - puts "saying it has done so on https://github.com/cooperhammond/irs" - exit(1) - end - - # where the vid metadata lives - yt_initial_data = yt_initial_data["contents"]["twoColumnSearchResultsRenderer"]["primaryContents"]["sectionListRenderer"]["contents"] - - video_metadata = [] of Hash(String, String) - - i = 0 - while true - begin - # video title - raw_metadata = yt_initial_data[0]["itemSectionRenderer"]["contents"][i]["videoRenderer"] - - metadata = {} of String => String - - metadata["title"] = raw_metadata["title"]["runs"][0]["text"].as_s - metadata["href"] = raw_metadata["navigationEndpoint"]["commandMetadata"]["webCommandMetadata"]["url"].as_s - - video_metadata.push(metadata) - rescue IndexError - break - rescue Exception - end - i += 1 - end - - return video_metadata - end -end +end \ No newline at end of file