Mirror of https://github.com/cooperhammond/irs.git, synced 2024-12-22 17:35:28 +00:00
Merge pull request #77 from cooperhammond/search-improvement
Search improvement based on song duration
This commit is contained in: commit 92e8885ae9
.gitignore (vendored): 1 line changed
@@ -12,3 +12,4 @@
ffmpeg
ffprobe
youtube-dl
+*.temp
@@ -46,6 +46,7 @@ class TrackMapper
type: Int32,
setter: true
},
+duration_ms: Int32,
type: String,
uri: String
)
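For context, `duration_ms` is the field the Spotify Web API uses for a track's length in whole milliseconds; mapping it through `TrackMapper` is what gives the ranker a ground-truth duration to compare against. A minimal sketch of reading that field from a track payload (the JSON below is illustrative, not a real API response):

```crystal
require "json"

# Illustrative fragment of a Spotify track object; real responses carry many more fields.
track = JSON.parse(%({
  "name": "Bohemian Rhapsody",
  "duration_ms": 354320,
  "artists": [{"name": "Queen"}]
}))

track["duration_ms"].as_i # => 354320 (roughly 5 minutes 54 seconds)
```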
@@ -85,6 +85,8 @@ class Song
end

data = @metadata.as(JSON::Any)
@song_name = data["name"].as_s
@artist_name = data["artists"][0]["name"].as_s
@filename = "#{Pattern.parse(Config.filename_pattern, data)}.mp3"

if ask_url
@@ -97,7 +99,7 @@ class Song

if !url
outputter("url", 0)
-url = Youtube.find_url(@song_name, @artist_name, search_terms: "lyrics")
+url = Youtube.find_url(data, search_terms: "lyrics")
if !url
raise("There was no url found on youtube for " +
%("#{@song_name}" by "#{@artist_name}. ) +
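The call site now hands `Youtube.find_url` the whole Spotify metadata object instead of just the name and artist strings, which is what lets the ranker read `duration_ms` further down. A rough sketch of the new call shape, using the same variable names as the surrounding method:

```crystal
data = @metadata.as(JSON::Any) # full Spotify track object, as parsed earlier in the method
url = Youtube.find_url(data, search_terms: "lyrics")
# find_url pulls data["name"] and data["artists"][0]["name"] for the query itself,
# and the ranker additionally reads data["duration_ms"] when scoring results.
```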
@@ -119,29 +121,29 @@ class Song

outputter("albumart", 0)
temp_albumart_filename = ".tempalbumart.jpg"
-HTTP::Client.get(data["album"]["images"][0]["url"].to_s) do |response|
+HTTP::Client.get(data["album"]["images"][0]["url"].as_s) do |response|
File.write(temp_albumart_filename, response.body_io)
end
outputter("albumart", 0)

# check if song's metadata has been modded in playlist, update artist accordingly
if data["artists"][-1]["owner"]?
-@artist = data["artists"][-1]["name"].to_s
+@artist = data["artists"][-1]["name"].as_s
else
-@artist = data["artists"][0]["name"].to_s
+@artist = data["artists"][0]["name"].as_s
end
-@album = data["album"]["name"].to_s
+@album = data["album"]["name"].as_s

tagger = Tags.new(@filename)
tagger.add_album_art(temp_albumart_filename)
-tagger.add_text_tag("title", data["name"].to_s)
+tagger.add_text_tag("title", data["name"].as_s)
tagger.add_text_tag("artist", @artist)

if !@album.empty?
tagger.add_text_tag("album", @album)
end

-if genre = @spotify_searcher.find_genre(data["artists"][0]["id"].to_s)
+if genre = @spotify_searcher.find_genre(data["artists"][0]["id"].as_s)
tagger.add_text_tag("genre", genre)
end
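The remaining edits in this hunk swap `JSON::Any#to_s` for `JSON::Any#as_s` on values that are expected to be strings. The practical difference: `as_s` asserts the underlying type and raises if the value is not actually a string, rather than silently stringifying whatever is there. A standalone sketch of that behaviour (not taken from the repo):

```crystal
require "json"

value = JSON.parse(%({"name": "Queen", "popularity": 87}))

value["name"].as_s       # => "Queen"
value["popularity"].to_s # => "87" (quietly stringifies the number)
value["popularity"].as_s # raises TypeCastError: the value is an Int, not a String
```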
src/search/ranking.cr (new file, 144 lines added)
@@ -0,0 +1,144 @@
alias VID_VALUE_CLASS = String
alias VID_METADATA_CLASS = Hash(String, VID_VALUE_CLASS)
alias YT_METADATA_CLASS = Array(VID_METADATA_CLASS)

module Ranker
  extend self

  GARBAGE_PHRASES = [
    "cover", "album", "live", "clean", "version", "full", "full album", "row",
    "at", "@", "session", "how to", "npr music", "reimagined", "version",
    "trailer"
  ]

  GOLDEN_PHRASES = [
    "official video", "official music video",
  ]

  # Ranks videos according to their title, duration, and the user input; returns a sorted
  # array of hashes of the points each video was assigned and its original index.
  # *spotify_metadata* is the metadata (from spotify) of the song that you want
  # *yt_metadata* is an array of hashes with metadata scraped from the youtube search result page
  # *query* is the query that you submitted to youtube for the results you now have
  # ```
  # Ranker.rank_videos(spotify_metadata, yt_metadata, query)
  # => [
  #   {"points" => x, "index" => x},
  #   ...
  # ]
  # ```
  # "index" corresponds to the original index of the video in yt_metadata
  def rank_videos(spotify_metadata : JSON::Any, yt_metadata : YT_METADATA_CLASS,
                  query : String) : Array(Hash(String, Int32))
    points = [] of Hash(String, Int32)
    index = 0

    actual_song_name = spotify_metadata["name"].as_s
    actual_artist_name = spotify_metadata["artists"][0]["name"].as_s

    yt_metadata.each do |vid|
      pts = 0

      pts += points_string_compare(actual_song_name, vid["title"])
      pts += points_string_compare(actual_artist_name, vid["title"])
      pts += count_buzzphrases(query, vid["title"])
      pts += compare_timestamps(spotify_metadata, vid)

      points.push({
        "points" => pts,
        "index"  => index,
      })
      index += 1
    end

    # Sort first by points and then by original index of the song
    points.sort! { |a, b|
      if b["points"] == a["points"]
        a["index"] <=> b["index"]
      else
        b["points"] <=> a["points"]
      end
    }

    return points
  end

  # SINGULAR COMPONENT OF RANKING ALGORITHM
  private def compare_timestamps(spotify_metadata : JSON::Any, node : VID_METADATA_CLASS) : Int32
    # puts spotify_metadata.to_pretty_json()
    actual_time = spotify_metadata["duration_ms"].as_i
    vid_time = node["duration_ms"].to_i

    difference = (actual_time - vid_time).abs

    # puts "actual: #{actual_time}, vid: #{vid_time}"
    # puts "\tdiff: #{difference}"
    # puts "\ttitle: #{node["title"]}"

    if difference <= 1000
      return 3
    elsif difference <= 2000
      return 2
    elsif difference <= 5000
      return 1
    else
      return 0
    end
  end

  # SINGULAR COMPONENT OF RANKING ALGORITHM
  # Returns an `Int` based off the number of points worth assigning to the
  # matchiness of the string. First the strings are downcased and then all
  # nonalphanumeric characters are stripped.
  # If *item2* includes *item1*, return 3 pts.
  # If, after the items have been stripped, *item2* includes *item1*,
  # return 1 pt.
  # Else, return 0 pts.
  private def points_string_compare(item1 : String, item2 : String) : Int32
    if item2.includes?(item1)
      return 3
    end

    item1 = item1.downcase.gsub(/[^a-z0-9]/, "")
    item2 = item2.downcase.gsub(/[^a-z0-9]/, "")

    if item2.includes?(item1)
      return 1
    else
      return 0
    end
  end

  # SINGULAR COMPONENT OF RANKING ALGORITHM
  # Checks if there are any phrases in the title of the video that would
  # indicate the audio having what we want.
  # *video_name* is the title of the video, and *query* is what the program
  # searched for. *query* is needed in order to make sure we're not
  # subtracting points for something that's naturally in the title.
  private def count_buzzphrases(query : String, video_name : String) : Int32
    good_phrases = 0
    bad_phrases = 0

    GOLDEN_PHRASES.each do |gold_phrase|
      gold_phrase = gold_phrase.downcase.gsub(/[^a-z0-9]/, "")

      if query.downcase.gsub(/[^a-z0-9]/, "").includes?(gold_phrase)
        next
      elsif video_name.downcase.gsub(/[^a-z0-9]/, "").includes?(gold_phrase)
        good_phrases += 1
      end
    end

    GARBAGE_PHRASES.each do |garbage_phrase|
      garbage_phrase = garbage_phrase.downcase.gsub(/[^a-z0-9]/, "")

      if query.downcase.gsub(/[^a-z0-9]/, "").includes?(garbage_phrase)
        next
      elsif video_name.downcase.gsub(/[^a-z0-9]/, "").includes?(garbage_phrase)
        bad_phrases += 1
      end
    end

    return good_phrases - bad_phrases
  end
end
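Taken together, `ranking.cr` scores every search result on title match, artist match, buzzphrases, and the new duration component: a video within 1 second of Spotify's `duration_ms` earns 3 points, within 2 seconds 2 points, within 5 seconds 1 point, and anything further off earns nothing. A rough usage sketch with hand-made inputs (the video entries and the exact point totals are illustrative only):

```crystal
require "json"
require "./ranking" # assumes this sketch sits next to src/search/ranking.cr

spotify_metadata = JSON.parse(%({
  "name": "Bohemian Rhapsody",
  "duration_ms": 354320,
  "artists": [{"name": "Queen"}]
}))

yt_metadata = [
  {"title" => "Queen - Bohemian Rhapsody (Official Video)", "href" => "/watch?v=a", "duration_ms" => "354000"},
  {"title" => "Queen - Bohemian Rhapsody (Live Aid 1985)", "href" => "/watch?v=b", "duration_ms" => "420000"},
] of Hash(String, String)

ranked = Ranker.rank_videos(spotify_metadata, yt_metadata, "Bohemian Rhapsody Queen lyrics")
# The official upload is within 1000 ms of Spotify's duration (full 3 duration points)
# and carries a golden phrase, while the live recording is over a minute off and is
# penalised for the "live" garbage phrase, so index 0 should rank first.
ranked.first # => {"points" => ..., "index" => 0}
```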
@@ -3,6 +3,8 @@ require "xml"
require "json"
require "uri"

+require "./ranking"

module Youtube
extend self
@@ -12,17 +14,122 @@ module Youtube
"yt-uix-tile-link yt-ui-ellipsis yt-ui-ellipsis-2 yt-uix-sessionlink spf-link ",
]

GARBAGE_PHRASES = [
"cover", "album", "live", "clean", "version", "full", "full album", "row",
"at", "@", "session", "how to", "npr music", "reimagined", "hr version",
"trailer",
]
# Note that VID_VALUE_CLASS, VID_METADATA_CLASS, and YT_METADATA_CLASS are found in ranking.cr

GOLDEN_PHRASES = [
"official video", "official music video",
]
# Finds a youtube url based off of the given information.
# The query to youtube is constructed like this:
# "<song_name> <artist_name> <search terms>"
# If *download_first* is provided, the first link found will be downloaded.
# If *select_link* is provided, a menu of options will be shown for the user to choose their poison
#
# ```
# Youtube.find_url("Bohemian Rhapsody", "Queen")
# => "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
# ```
def find_url(spotify_metadata : JSON::Any, search_terms = "",
download_first = false, select_link = false) : String?

alias NODES_CLASS = Array(Hash(String, String))
song_name = spotify_metadata["name"].as_s
artist_name = spotify_metadata["artists"][0]["name"].as_s

human_query = song_name + " " + artist_name + " " + search_terms.strip
url_query = human_query.gsub(" ", "+")

url = "https://www.youtube.com/results?search_query=" + url_query

response = HTTP::Client.get(url)

yt_metadata = get_yt_search_metadata(response.body)

if yt_metadata.size == 0
puts "There were no results for this query on youtube: \"#{human_query}\""
return nil
end

root = "https://youtube.com"

if download_first
return root + yt_metadata[0]["href"]
end

if select_link
# return select_link_menu()
end

ranked = Ranker.rank_videos(spotify_metadata, yt_metadata, human_query)

begin
return root + yt_metadata[ranked[0]["index"]]["href"]
rescue IndexError
return nil
end

exit 1
end

#
private def select_link_menu() : String

end

# Finds valid video links from a `HTTP::Client.get` request
# Returns an `Array` of `NODES_CLASS` containing additional metadata from Youtube
private def get_yt_search_metadata(response_body : String) : YT_METADATA_CLASS
yt_initial_data : JSON::Any = JSON.parse("{}")

response_body.each_line do |line|
# timestamp 11/8/2020:
# youtube's html page has a line previous to this literally with 'scraper_data_begin' as a comment
if line.includes?("var ytInitialData")
# Extract JSON data from line
data = line.split(" = ")[2].delete(';')
dataEnd = (data.index("</script>") || 0) - 1

begin
yt_initial_data = JSON.parse(data[0..dataEnd])
rescue
break
end
end
end

if yt_initial_data == JSON.parse("{}")
puts "Youtube has changed the way it organizes its webpage, submit a bug"
puts "saying it has done so on https://github.com/cooperhammond/irs"
exit(1)
end

# where the vid metadata lives
yt_initial_data = yt_initial_data["contents"]["twoColumnSearchResultsRenderer"]["primaryContents"]["sectionListRenderer"]["contents"]

video_metadata = [] of VID_METADATA_CLASS

i = 0
while true
begin
# video title
raw_metadata = yt_initial_data[0]["itemSectionRenderer"]["contents"][i]["videoRenderer"]

metadata = {} of String => VID_VALUE_CLASS

metadata["title"] = raw_metadata["title"]["runs"][0]["text"].as_s
metadata["href"] = raw_metadata["navigationEndpoint"]["commandMetadata"]["webCommandMetadata"]["url"].as_s
timestamp = raw_metadata["lengthText"]["simpleText"].as_s
metadata["timestamp"] = timestamp
metadata["duration_ms"] = ((timestamp.split(":")[0].to_i * 60 +
timestamp.split(":")[1].to_i) * 1000).to_s

video_metadata.push(metadata)
rescue IndexError
break
rescue Exception
end
i += 1
end

return video_metadata
end

# Checks if the given URL is a valid youtube URL
#
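The scraper only gets a human-readable `lengthText` such as "5:54" from the results page, so it converts minutes and seconds into milliseconds before the ranker compares it with Spotify's `duration_ms`. A worked sketch of the same arithmetic; note that, as written, the parse appears to assume an M:SS timestamp, so an H:MM:SS length would be read as minutes and seconds:

```crystal
timestamp = "5:54"                     # what YouTube's lengthText looks like for a 5 min 54 s video
minutes = timestamp.split(":")[0].to_i # => 5
seconds = timestamp.split(":")[1].to_i # => 54
duration_ms = (minutes * 60 + seconds) * 1000
duration_ms # => 354000, stored as the string "354000" in the video's metadata hash
```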
@@ -62,186 +169,4 @@ module Youtube

return response.body.includes?("status=ok")
end

# Finds a youtube url based off of the given information.
# The query to youtube is constructed like this:
# "<song_name> <artist_name> <search terms>"
# If *download_first* is provided, the first link found will be downloaded.
#
# ```
# Youtube.find_url("Bohemian Rhapsody", "Queen")
# => "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
# ```
def find_url(song_name : String, artist_name : String, search_terms = "",
download_first = false) : String?
query = (song_name + " " + artist_name + " " + search_terms).strip.gsub(" ", "+")

url = "https://www.youtube.com/results?search_query=" + query

response = HTTP::Client.get(url)

valid_nodes = get_video_link_nodes(response.body)

if valid_nodes.size == 0
puts "There were no results for that query."
return nil
end

root = "https://youtube.com"

return root + valid_nodes[0]["href"] if download_first

ranked = rank_videos(song_name, artist_name, query, valid_nodes)

begin
return root + valid_nodes[ranked[0]["index"]]["href"]
rescue IndexError
return nil
end
end

# Will rank videos according to their title and the user input
# Return:
# [
#   {"points" => x, "index" => x},
#   ...
# ]
private def rank_videos(song_name : String, artist_name : String,
query : String, nodes : Array(Hash(String, String))) : Array(Hash(String, Int32))
points = [] of Hash(String, Int32)
index = 0

nodes.each do |node|
pts = 0

pts += points_compare(song_name, node["title"])
pts += points_compare(artist_name, node["title"])
pts += count_buzzphrases(query, node["title"])

points.push({
"points" => pts,
"index" => index,
})
index += 1
end

# Sort first by points and then by original index of the song
points.sort! { |a, b|
if b["points"] == a["points"]
a["index"] <=> b["index"]
else
b["points"] <=> a["points"]
end
}

return points
end

# Returns an `Int` based off the number of points worth assigning to the
# matchiness of the string. First the strings are downcased and then all
# nonalphanumeric characters are stripped.
# If *item1* includes *item2*, return 3 pts.
# If after the items have been blanked, *item1* includes *item2*,
# return 1 pts.
# Else, return 0 pts.
private def points_compare(item1 : String, item2 : String) : Int32
if item2.includes?(item1)
return 3
end

item1 = item1.downcase.gsub(/[^a-z0-9]/, "")
item2 = item2.downcase.gsub(/[^a-z0-9]/, "")

if item2.includes?(item1)
return 1
else
return 0
end
end

# Checks if there are any phrases in the title of the video that would
# indicate audio having what we want.
# *video_name* is the title of the video, and *query* is what the user the
# program searched for. *query* is needed in order to make sure we're not
# subtracting points from something that's naturally in the title
private def count_buzzphrases(query : String, video_name : String) : Int32
good_phrases = 0
bad_phrases = 0

GOLDEN_PHRASES.each do |gold_phrase|
gold_phrase = gold_phrase.downcase.gsub(/[^a-z0-9]/, "")

if query.downcase.gsub(/[^a-z0-9]/, "").includes?(gold_phrase)
next
elsif video_name.downcase.gsub(/[^a-z0-9]/, "").includes?(gold_phrase)
good_phrases += 1
end
end

GARBAGE_PHRASES.each do |garbage_phrase|
garbage_phrase = garbage_phrase.downcase.gsub(/[^a-z0-9]/, "")

if query.downcase.gsub(/[^a-z0-9]/, "").includes?(garbage_phrase)
next
elsif video_name.downcase.gsub(/[^a-z0-9]/, "").includes?(garbage_phrase)
bad_phrases += 1
end
end

return good_phrases - bad_phrases
end

# Finds valid video links from a `HTTP::Client.get` request
# Returns an `Array` of `XML::Node`
private def get_video_link_nodes(response_body : String) : NODES_CLASS
yt_initial_data : JSON::Any = JSON.parse("{}")

response_body.each_line do |line|
# timestamp 11/8/2020:
# youtube's html page has a line previous to this literally with 'scraper_data_begin' as a comment
if line.includes?("var ytInitialData")
# Extract JSON data from line
data = line.split(" = ")[2].delete(';')
dataEnd = (data.index("</script>") || 0) - 1

begin
yt_initial_data = JSON.parse(data[0..dataEnd])
rescue
break
end
end
end

if yt_initial_data == JSON.parse("{}")
puts "Youtube has changed the way it organizes its webpage, submit a bug"
puts "saying it has done so on https://github.com/cooperhammond/irs"
exit(1)
end

# where the vid metadata lives
yt_initial_data = yt_initial_data["contents"]["twoColumnSearchResultsRenderer"]["primaryContents"]["sectionListRenderer"]["contents"]

video_metadata = [] of Hash(String, String)

i = 0
while true
begin
# video title
raw_metadata = yt_initial_data[0]["itemSectionRenderer"]["contents"][i]["videoRenderer"]

metadata = {} of String => String

metadata["title"] = raw_metadata["title"]["runs"][0]["text"].as_s
metadata["href"] = raw_metadata["navigationEndpoint"]["commandMetadata"]["webCommandMetadata"]["url"].as_s

video_metadata.push(metadata)
rescue IndexError
break
rescue Exception
end
i += 1
end

return video_metadata
end
end