Merge pull request #77 from cooperhammond/search-improvement

Search improvement based on song duration
This commit is contained in:
Cooper Hammond 2021-04-15 09:45:27 -06:00 committed by GitHub
commit 92e8885ae9
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 273 additions and 200 deletions

1
.gitignore vendored
View file

@ -12,3 +12,4 @@
ffmpeg
ffprobe
youtube-dl
*.temp

View file

@ -46,6 +46,7 @@ class TrackMapper
type: Int32,
setter: true
},
duration_ms: Int32,
type: String,
uri: String
)

View file

@ -85,6 +85,8 @@ class Song
end
data = @metadata.as(JSON::Any)
@song_name = data["name"].as_s
@artist_name = data["artists"][0]["name"].as_s
@filename = "#{Pattern.parse(Config.filename_pattern, data)}.mp3"
if ask_url
@ -97,7 +99,7 @@ class Song
if !url
outputter("url", 0)
url = Youtube.find_url(@song_name, @artist_name, search_terms: "lyrics")
url = Youtube.find_url(data, search_terms: "lyrics")
if !url
raise("There was no url found on youtube for " +
%("#{@song_name}" by "#{@artist_name}. ) +
@ -119,29 +121,29 @@ class Song
outputter("albumart", 0)
temp_albumart_filename = ".tempalbumart.jpg"
HTTP::Client.get(data["album"]["images"][0]["url"].to_s) do |response|
HTTP::Client.get(data["album"]["images"][0]["url"].as_s) do |response|
File.write(temp_albumart_filename, response.body_io)
end
outputter("albumart", 0)
# check if song's metadata has been modded in playlist, update artist accordingly
if data["artists"][-1]["owner"]?
@artist = data["artists"][-1]["name"].to_s
@artist = data["artists"][-1]["name"].as_s
else
@artist = data["artists"][0]["name"].to_s
@artist = data["artists"][0]["name"].as_s
end
@album = data["album"]["name"].to_s
@album = data["album"]["name"].as_s
tagger = Tags.new(@filename)
tagger.add_album_art(temp_albumart_filename)
tagger.add_text_tag("title", data["name"].to_s)
tagger.add_text_tag("title", data["name"].as_s)
tagger.add_text_tag("artist", @artist)
if !@album.empty?
tagger.add_text_tag("album", @album)
end
if genre = @spotify_searcher.find_genre(data["artists"][0]["id"].to_s)
if genre = @spotify_searcher.find_genre(data["artists"][0]["id"].as_s)
tagger.add_text_tag("genre", genre)
end

144
src/search/ranking.cr Normal file
View file

@ -0,0 +1,144 @@
alias VID_VALUE_CLASS = String
alias VID_METADATA_CLASS = Hash(String, VID_VALUE_CLASS)
alias YT_METADATA_CLASS = Array(VID_METADATA_CLASS)
module Ranker
extend self
GARBAGE_PHRASES = [
"cover", "album", "live", "clean", "version", "full", "full album", "row",
"at", "@", "session", "how to", "npr music", "reimagined", "version",
"trailer"
]
GOLDEN_PHRASES = [
"official video", "official music video",
]
# Will rank videos according to their title and the user input, returns a sorted array of hashes
# of the points a song was assigned and its original index
# *spotify_metadata* is the metadate (from spotify) of the song that you want
# *yt_metadata* is an array of hashes with metadata scraped from the youtube search result page
# *query* is the query that you submitted to youtube for the results you now have
# ```
# Ranker.rank_videos(spotify_metadata, yt_metadata, query)
# => [
# {"points" => x, "index" => x},
# ...
# ]
# ```
# "index" corresponds to the original index of the song in yt_metadata
def rank_videos(spotify_metadata : JSON::Any, yt_metadata : YT_METADATA_CLASS,
query : String) : Array(Hash(String, Int32))
points = [] of Hash(String, Int32)
index = 0
actual_song_name = spotify_metadata["name"].as_s
actual_artist_name = spotify_metadata["artists"][0]["name"].as_s
yt_metadata.each do |vid|
pts = 0
pts += points_string_compare(actual_song_name, vid["title"])
pts += points_string_compare(actual_artist_name, vid["title"])
pts += count_buzzphrases(query, vid["title"])
pts += compare_timestamps(spotify_metadata, vid)
points.push({
"points" => pts,
"index" => index,
})
index += 1
end
# Sort first by points and then by original index of the song
points.sort! { |a, b|
if b["points"] == a["points"]
a["index"] <=> b["index"]
else
b["points"] <=> a["points"]
end
}
return points
end
# SINGULAR COMPONENT OF RANKING ALGORITHM
private def compare_timestamps(spotify_metadata : JSON::Any, node : VID_METADATA_CLASS) : Int32
# puts spotify_metadata.to_pretty_json()
actual_time = spotify_metadata["duration_ms"].as_i
vid_time = node["duration_ms"].to_i
difference = (actual_time - vid_time).abs
# puts "actual: #{actual_time}, vid: #{vid_time}"
# puts "\tdiff: #{difference}"
# puts "\ttitle: #{node["title"]}"
if difference <= 1000
return 3
elsif difference <= 2000
return 2
elsif difference <= 5000
return 1
else
return 0
end
end
# SINGULAR COMPONENT OF RANKING ALGORITHM
# Returns an `Int` based off the number of points worth assigning to the
# matchiness of the string. First the strings are downcased and then all
# nonalphanumeric characters are stripped.
# If *item1* includes *item2*, return 3 pts.
# If after the items have been blanked, *item1* includes *item2*,
# return 1 pts.
# Else, return 0 pts.
private def points_string_compare(item1 : String, item2 : String) : Int32
if item2.includes?(item1)
return 3
end
item1 = item1.downcase.gsub(/[^a-z0-9]/, "")
item2 = item2.downcase.gsub(/[^a-z0-9]/, "")
if item2.includes?(item1)
return 1
else
return 0
end
end
# SINGULAR COMPONENT OF RANKING ALGORITHM
# Checks if there are any phrases in the title of the video that would
# indicate audio having what we want.
# *video_name* is the title of the video, and *query* is what the user the
# program searched for. *query* is needed in order to make sure we're not
# subtracting points from something that's naturally in the title
private def count_buzzphrases(query : String, video_name : String) : Int32
good_phrases = 0
bad_phrases = 0
GOLDEN_PHRASES.each do |gold_phrase|
gold_phrase = gold_phrase.downcase.gsub(/[^a-z0-9]/, "")
if query.downcase.gsub(/[^a-z0-9]/, "").includes?(gold_phrase)
next
elsif video_name.downcase.gsub(/[^a-z0-9]/, "").includes?(gold_phrase)
good_phrases += 1
end
end
GARBAGE_PHRASES.each do |garbage_phrase|
garbage_phrase = garbage_phrase.downcase.gsub(/[^a-z0-9]/, "")
if query.downcase.gsub(/[^a-z0-9]/, "").includes?(garbage_phrase)
next
elsif video_name.downcase.gsub(/[^a-z0-9]/, "").includes?(garbage_phrase)
bad_phrases += 1
end
end
return good_phrases - bad_phrases
end
end

View file

@ -3,6 +3,8 @@ require "xml"
require "json"
require "uri"
require "./ranking"
module Youtube
extend self
@ -12,17 +14,122 @@ module Youtube
"yt-uix-tile-link yt-ui-ellipsis yt-ui-ellipsis-2 yt-uix-sessionlink spf-link ",
]
GARBAGE_PHRASES = [
"cover", "album", "live", "clean", "version", "full", "full album", "row",
"at", "@", "session", "how to", "npr music", "reimagined", "hr version",
"trailer",
]
# Note that VID_VALUE_CLASS, VID_METADATA_CLASS, and YT_METADATA_CLASS are found in ranking.cr
GOLDEN_PHRASES = [
"official video", "official music video",
]
# Finds a youtube url based off of the given information.
# The query to youtube is constructed like this:
# "<song_name> <artist_name> <search terms>"
# If *download_first* is provided, the first link found will be downloaded.
# If *select_link* is provided, a menu of options will be shown for the user to choose their poison
#
# ```
# Youtube.find_url("Bohemian Rhapsody", "Queen")
# => "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
# ```
def find_url(spotify_metadata : JSON::Any, search_terms = "",
download_first = false, select_link = false) : String?
alias NODES_CLASS = Array(Hash(String, String))
song_name = spotify_metadata["name"].as_s
artist_name = spotify_metadata["artists"][0]["name"].as_s
human_query = song_name + " " + artist_name + " " + search_terms.strip
url_query = human_query.gsub(" ", "+")
url = "https://www.youtube.com/results?search_query=" + url_query
response = HTTP::Client.get(url)
yt_metadata = get_yt_search_metadata(response.body)
if yt_metadata.size == 0
puts "There were no results for this query on youtube: \"#{human_query}\""
return nil
end
root = "https://youtube.com"
if download_first
return root + yt_metadata[0]["href"]
end
if select_link
# return select_link_menu()
end
ranked = Ranker.rank_videos(spotify_metadata, yt_metadata, human_query)
begin
return root + yt_metadata[ranked[0]["index"]]["href"]
rescue IndexError
return nil
end
exit 1
end
#
private def select_link_menu() : String
end
# Finds valid video links from a `HTTP::Client.get` request
# Returns an `Array` of `NODES_CLASS` containing additional metadata from Youtube
private def get_yt_search_metadata(response_body : String) : YT_METADATA_CLASS
yt_initial_data : JSON::Any = JSON.parse("{}")
response_body.each_line do |line|
# timestamp 11/8/2020:
# youtube's html page has a line previous to this literally with 'scraper_data_begin' as a comment
if line.includes?("var ytInitialData")
# Extract JSON data from line
data = line.split(" = ")[2].delete(';')
dataEnd = (data.index("</script>") || 0) - 1
begin
yt_initial_data = JSON.parse(data[0..dataEnd])
rescue
break
end
end
end
if yt_initial_data == JSON.parse("{}")
puts "Youtube has changed the way it organizes its webpage, submit a bug"
puts "saying it has done so on https://github.com/cooperhammond/irs"
exit(1)
end
# where the vid metadata lives
yt_initial_data = yt_initial_data["contents"]["twoColumnSearchResultsRenderer"]["primaryContents"]["sectionListRenderer"]["contents"]
video_metadata = [] of VID_METADATA_CLASS
i = 0
while true
begin
# video title
raw_metadata = yt_initial_data[0]["itemSectionRenderer"]["contents"][i]["videoRenderer"]
metadata = {} of String => VID_VALUE_CLASS
metadata["title"] = raw_metadata["title"]["runs"][0]["text"].as_s
metadata["href"] = raw_metadata["navigationEndpoint"]["commandMetadata"]["webCommandMetadata"]["url"].as_s
timestamp = raw_metadata["lengthText"]["simpleText"].as_s
metadata["timestamp"] = timestamp
metadata["duration_ms"] = ((timestamp.split(":")[0].to_i * 60 +
timestamp.split(":")[1].to_i) * 1000).to_s
video_metadata.push(metadata)
rescue IndexError
break
rescue Exception
end
i += 1
end
return video_metadata
end
# Checks if the given URL is a valid youtube URL
#
@ -62,186 +169,4 @@ module Youtube
return response.body.includes?("status=ok")
end
# Finds a youtube url based off of the given information.
# The query to youtube is constructed like this:
# "<song_name> <artist_name> <search terms>"
# If *download_first* is provided, the first link found will be downloaded.
#
# ```
# Youtube.find_url("Bohemian Rhapsody", "Queen")
# => "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
# ```
def find_url(song_name : String, artist_name : String, search_terms = "",
download_first = false) : String?
query = (song_name + " " + artist_name + " " + search_terms).strip.gsub(" ", "+")
url = "https://www.youtube.com/results?search_query=" + query
response = HTTP::Client.get(url)
valid_nodes = get_video_link_nodes(response.body)
if valid_nodes.size == 0
puts "There were no results for that query."
return nil
end
root = "https://youtube.com"
return root + valid_nodes[0]["href"] if download_first
ranked = rank_videos(song_name, artist_name, query, valid_nodes)
begin
return root + valid_nodes[ranked[0]["index"]]["href"]
rescue IndexError
return nil
end
end
# Will rank videos according to their title and the user input
# Return:
# [
# {"points" => x, "index" => x},
# ...
# ]
private def rank_videos(song_name : String, artist_name : String,
query : String, nodes : Array(Hash(String, String))) : Array(Hash(String, Int32))
points = [] of Hash(String, Int32)
index = 0
nodes.each do |node|
pts = 0
pts += points_compare(song_name, node["title"])
pts += points_compare(artist_name, node["title"])
pts += count_buzzphrases(query, node["title"])
points.push({
"points" => pts,
"index" => index,
})
index += 1
end
# Sort first by points and then by original index of the song
points.sort! { |a, b|
if b["points"] == a["points"]
a["index"] <=> b["index"]
else
b["points"] <=> a["points"]
end
}
return points
end
# Returns an `Int` based off the number of points worth assigning to the
# matchiness of the string. First the strings are downcased and then all
# nonalphanumeric characters are stripped.
# If *item1* includes *item2*, return 3 pts.
# If after the items have been blanked, *item1* includes *item2*,
# return 1 pts.
# Else, return 0 pts.
private def points_compare(item1 : String, item2 : String) : Int32
if item2.includes?(item1)
return 3
end
item1 = item1.downcase.gsub(/[^a-z0-9]/, "")
item2 = item2.downcase.gsub(/[^a-z0-9]/, "")
if item2.includes?(item1)
return 1
else
return 0
end
end
# Checks if there are any phrases in the title of the video that would
# indicate audio having what we want.
# *video_name* is the title of the video, and *query* is what the user the
# program searched for. *query* is needed in order to make sure we're not
# subtracting points from something that's naturally in the title
private def count_buzzphrases(query : String, video_name : String) : Int32
good_phrases = 0
bad_phrases = 0
GOLDEN_PHRASES.each do |gold_phrase|
gold_phrase = gold_phrase.downcase.gsub(/[^a-z0-9]/, "")
if query.downcase.gsub(/[^a-z0-9]/, "").includes?(gold_phrase)
next
elsif video_name.downcase.gsub(/[^a-z0-9]/, "").includes?(gold_phrase)
good_phrases += 1
end
end
GARBAGE_PHRASES.each do |garbage_phrase|
garbage_phrase = garbage_phrase.downcase.gsub(/[^a-z0-9]/, "")
if query.downcase.gsub(/[^a-z0-9]/, "").includes?(garbage_phrase)
next
elsif video_name.downcase.gsub(/[^a-z0-9]/, "").includes?(garbage_phrase)
bad_phrases += 1
end
end
return good_phrases - bad_phrases
end
# Finds valid video links from a `HTTP::Client.get` request
# Returns an `Array` of `XML::Node`
private def get_video_link_nodes(response_body : String) : NODES_CLASS
yt_initial_data : JSON::Any = JSON.parse("{}")
response_body.each_line do |line|
# timestamp 11/8/2020:
# youtube's html page has a line previous to this literally with 'scraper_data_begin' as a comment
if line.includes?("var ytInitialData")
# Extract JSON data from line
data = line.split(" = ")[2].delete(';')
dataEnd = (data.index("</script>") || 0) - 1
begin
yt_initial_data = JSON.parse(data[0..dataEnd])
rescue
break
end
end
end
if yt_initial_data == JSON.parse("{}")
puts "Youtube has changed the way it organizes its webpage, submit a bug"
puts "saying it has done so on https://github.com/cooperhammond/irs"
exit(1)
end
# where the vid metadata lives
yt_initial_data = yt_initial_data["contents"]["twoColumnSearchResultsRenderer"]["primaryContents"]["sectionListRenderer"]["contents"]
video_metadata = [] of Hash(String, String)
i = 0
while true
begin
# video title
raw_metadata = yt_initial_data[0]["itemSectionRenderer"]["contents"][i]["videoRenderer"]
metadata = {} of String => String
metadata["title"] = raw_metadata["title"]["runs"][0]["text"].as_s
metadata["href"] = raw_metadata["navigationEndpoint"]["commandMetadata"]["webCommandMetadata"]["url"].as_s
video_metadata.push(metadata)
rescue IndexError
break
rescue Exception
end
i += 1
end
return video_metadata
end
end