mirror of
https://github.com/cooperhammond/irs.git
synced 2024-12-22 17:35:28 +00:00
Merge pull request #77 from cooperhammond/search-improvement
Search improvement based on song duration
This commit is contained in:
commit
92e8885ae9
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -12,3 +12,4 @@
|
||||||
ffmpeg
|
ffmpeg
|
||||||
ffprobe
|
ffprobe
|
||||||
youtube-dl
|
youtube-dl
|
||||||
|
*.temp
|
|
@ -46,6 +46,7 @@ class TrackMapper
|
||||||
type: Int32,
|
type: Int32,
|
||||||
setter: true
|
setter: true
|
||||||
},
|
},
|
||||||
|
duration_ms: Int32,
|
||||||
type: String,
|
type: String,
|
||||||
uri: String
|
uri: String
|
||||||
)
|
)
|
||||||
|
|
|
@ -85,6 +85,8 @@ class Song
|
||||||
end
|
end
|
||||||
|
|
||||||
data = @metadata.as(JSON::Any)
|
data = @metadata.as(JSON::Any)
|
||||||
|
@song_name = data["name"].as_s
|
||||||
|
@artist_name = data["artists"][0]["name"].as_s
|
||||||
@filename = "#{Pattern.parse(Config.filename_pattern, data)}.mp3"
|
@filename = "#{Pattern.parse(Config.filename_pattern, data)}.mp3"
|
||||||
|
|
||||||
if ask_url
|
if ask_url
|
||||||
|
@ -97,7 +99,7 @@ class Song
|
||||||
|
|
||||||
if !url
|
if !url
|
||||||
outputter("url", 0)
|
outputter("url", 0)
|
||||||
url = Youtube.find_url(@song_name, @artist_name, search_terms: "lyrics")
|
url = Youtube.find_url(data, search_terms: "lyrics")
|
||||||
if !url
|
if !url
|
||||||
raise("There was no url found on youtube for " +
|
raise("There was no url found on youtube for " +
|
||||||
%("#{@song_name}" by "#{@artist_name}. ) +
|
%("#{@song_name}" by "#{@artist_name}. ) +
|
||||||
|
@ -119,29 +121,29 @@ class Song
|
||||||
|
|
||||||
outputter("albumart", 0)
|
outputter("albumart", 0)
|
||||||
temp_albumart_filename = ".tempalbumart.jpg"
|
temp_albumart_filename = ".tempalbumart.jpg"
|
||||||
HTTP::Client.get(data["album"]["images"][0]["url"].to_s) do |response|
|
HTTP::Client.get(data["album"]["images"][0]["url"].as_s) do |response|
|
||||||
File.write(temp_albumart_filename, response.body_io)
|
File.write(temp_albumart_filename, response.body_io)
|
||||||
end
|
end
|
||||||
outputter("albumart", 0)
|
outputter("albumart", 0)
|
||||||
|
|
||||||
# check if song's metadata has been modded in playlist, update artist accordingly
|
# check if song's metadata has been modded in playlist, update artist accordingly
|
||||||
if data["artists"][-1]["owner"]?
|
if data["artists"][-1]["owner"]?
|
||||||
@artist = data["artists"][-1]["name"].to_s
|
@artist = data["artists"][-1]["name"].as_s
|
||||||
else
|
else
|
||||||
@artist = data["artists"][0]["name"].to_s
|
@artist = data["artists"][0]["name"].as_s
|
||||||
end
|
end
|
||||||
@album = data["album"]["name"].to_s
|
@album = data["album"]["name"].as_s
|
||||||
|
|
||||||
tagger = Tags.new(@filename)
|
tagger = Tags.new(@filename)
|
||||||
tagger.add_album_art(temp_albumart_filename)
|
tagger.add_album_art(temp_albumart_filename)
|
||||||
tagger.add_text_tag("title", data["name"].to_s)
|
tagger.add_text_tag("title", data["name"].as_s)
|
||||||
tagger.add_text_tag("artist", @artist)
|
tagger.add_text_tag("artist", @artist)
|
||||||
|
|
||||||
if !@album.empty?
|
if !@album.empty?
|
||||||
tagger.add_text_tag("album", @album)
|
tagger.add_text_tag("album", @album)
|
||||||
end
|
end
|
||||||
|
|
||||||
if genre = @spotify_searcher.find_genre(data["artists"][0]["id"].to_s)
|
if genre = @spotify_searcher.find_genre(data["artists"][0]["id"].as_s)
|
||||||
tagger.add_text_tag("genre", genre)
|
tagger.add_text_tag("genre", genre)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
144
src/search/ranking.cr
Normal file
144
src/search/ranking.cr
Normal file
|
@ -0,0 +1,144 @@
|
||||||
|
# Every value scraped for a video (title, href, timestamp, duration_ms) is kept as a String.
alias VID_VALUE_CLASS = String
# Metadata of a single youtube search result, keyed by field name.
alias VID_METADATA_CLASS = Hash(String, VID_VALUE_CLASS)
# Metadata of every search result, in the order youtube listed them.
alias YT_METADATA_CLASS = Array(VID_METADATA_CLASS)

# Scores youtube search results against the spotify metadata of the wanted
# song so that the most plausible video can be picked.
module Ranker
  extend self

  # Phrases that cost a video one point each when they show up in its title
  # but not in the query (see `count_buzzphrases`).
  # Matching is done on normalised text (lowercased, everything except a-z and
  # 0-9 removed), so "@" becomes an empty string and is always skipped, and
  # short entries such as "at" match anywhere inside a word.
  # NOTE(review): "version" is listed twice, so a title containing it is
  # penalised twice -- confirm whether that is intended.
  GARBAGE_PHRASES = [
    "cover", "album", "live", "clean", "version", "full", "full album", "row",
    "at", "@", "session", "how to", "npr music", "reimagined", "version",
    "trailer"
  ]

  # Phrases that earn a video one point each when they show up in its title
  # but not in the query (see `count_buzzphrases`).
  GOLDEN_PHRASES = [
    "official video", "official music video",
  ]

  # Will rank videos according to their title and the user input, returns a sorted array of hashes
  # of the points a song was assigned and its original index
  # *spotify_metadata* is the metadata (from spotify) of the song that you want
  # *yt_metadata* is an array of hashes with metadata scraped from the youtube search result page
  # *query* is the query that you submitted to youtube for the results you now have
  # ```
  # Ranker.rank_videos(spotify_metadata, yt_metadata, query)
  # => [
  #   {"points" => x, "index" => x},
  #   ...
  # ]
  # ```
  # "index" corresponds to the original index of the song in yt_metadata
  def rank_videos(spotify_metadata : JSON::Any, yt_metadata : YT_METADATA_CLASS,
                  query : String) : Array(Hash(String, Int32))
    actual_song_name = spotify_metadata["name"].as_s
    actual_artist_name = spotify_metadata["artists"][0]["name"].as_s

    points = [] of Hash(String, Int32)

    yt_metadata.each_with_index do |vid, index|
      pts = 0

      pts += points_string_compare(actual_song_name, vid["title"])
      pts += points_string_compare(actual_artist_name, vid["title"])
      pts += count_buzzphrases(query, vid["title"])
      pts += compare_timestamps(spotify_metadata, vid)

      points.push({
        "points" => pts,
        "index"  => index,
      })
    end

    # Sort first by points (highest first) and then by original index of the
    # song, so ties keep youtube's own ordering
    points.sort! { |a, b|
      if b["points"] == a["points"]
        a["index"] <=> b["index"]
      else
        b["points"] <=> a["points"]
      end
    }

    return points
  end

  # SINGULAR COMPONENT OF RANKING ALGORITHM
  # Compares the spotify track length (`duration_ms` of *spotify_metadata*)
  # with the video length (`duration_ms` of *node*, stored as a String).
  # Returns 3 pts when they differ by at most 1000 ms, 2 pts for at most
  # 2000 ms, 1 pt for at most 5000 ms, and 0 pts otherwise.
  private def compare_timestamps(spotify_metadata : JSON::Any, node : VID_METADATA_CLASS) : Int32
    actual_time = spotify_metadata["duration_ms"].as_i
    vid_time = node["duration_ms"].to_i

    # The absolute difference is never negative, so the ranges below cover
    # exactly the same cases as a `<=` ladder.
    case (actual_time - vid_time).abs
    when 0..1000    then 3
    when 1001..2000 then 2
    when 2001..5000 then 1
    else                 0
    end
  end

  # SINGULAR COMPONENT OF RANKING ALGORITHM
  # Returns an `Int` based off the number of points worth assigning to how
  # well *item1* (the text being looked for) matches *item2* (the text being
  # searched, e.g. a video title).
  # If *item2* contains *item1* verbatim, return 3 pts.
  # If *item2* contains *item1* once both have been downcased and stripped of
  # all characters other than a-z and 0-9, return 1 pts.
  # Else, return 0 pts. An *item1* that is empty, or that becomes empty after
  # stripping (for example a name with no ASCII letters or digits), can never
  # match and scores 0 pts.
  private def points_string_compare(item1 : String, item2 : String) : Int32
    # An empty needle is "included" in every string; it carries no information.
    return 0 if item1.empty?

    if item2.includes?(item1)
      return 3
    end

    item1 = normalize_for_matching(item1)
    item2 = normalize_for_matching(item2)

    # Without this guard every video would earn a point for a name that
    # normalises to nothing.
    return 0 if item1.empty?

    if item2.includes?(item1)
      return 1
    else
      return 0
    end
  end

  # SINGULAR COMPONENT OF RANKING ALGORITHM
  # Checks if there are any phrases in the title of the video that would
  # indicate whether the audio is what we want.
  # *video_name* is the title of the video, and *query* is what the program
  # searched for. *query* is needed in order to make sure we're not
  # adding or subtracting points for something that is naturally part of the
  # search itself.
  # Returns the number of matching `GOLDEN_PHRASES` minus the number of
  # matching `GARBAGE_PHRASES`.
  private def count_buzzphrases(query : String, video_name : String) : Int32
    # Normalise once instead of once per phrase.
    flat_query = normalize_for_matching(query)
    flat_title = normalize_for_matching(video_name)

    good_phrases = GOLDEN_PHRASES.count do |gold_phrase|
      needle = normalize_for_matching(gold_phrase)
      !flat_query.includes?(needle) && flat_title.includes?(needle)
    end

    bad_phrases = GARBAGE_PHRASES.count do |garbage_phrase|
      needle = normalize_for_matching(garbage_phrase)
      !flat_query.includes?(needle) && flat_title.includes?(needle)
    end

    return good_phrases - bad_phrases
  end

  # Lowercases *text* and removes every character that is not a-z or 0-9.
  private def normalize_for_matching(text : String) : String
    text.downcase.gsub(/[^a-z0-9]/, "")
  end
end
|
|
@ -3,6 +3,8 @@ require "xml"
|
||||||
require "json"
|
require "json"
|
||||||
require "uri"
|
require "uri"
|
||||||
|
|
||||||
|
require "./ranking"
|
||||||
|
|
||||||
|
|
||||||
module Youtube
|
module Youtube
|
||||||
extend self
|
extend self
|
||||||
|
@ -12,17 +14,122 @@ module Youtube
|
||||||
"yt-uix-tile-link yt-ui-ellipsis yt-ui-ellipsis-2 yt-uix-sessionlink spf-link ",
|
"yt-uix-tile-link yt-ui-ellipsis yt-ui-ellipsis-2 yt-uix-sessionlink spf-link ",
|
||||||
]
|
]
|
||||||
|
|
||||||
GARBAGE_PHRASES = [
|
# Note that VID_VALUE_CLASS, VID_METADATA_CLASS, and YT_METADATA_CLASS are found in ranking.cr
|
||||||
"cover", "album", "live", "clean", "version", "full", "full album", "row",
|
|
||||||
"at", "@", "session", "how to", "npr music", "reimagined", "hr version",
|
|
||||||
"trailer",
|
|
||||||
]
|
|
||||||
|
|
||||||
GOLDEN_PHRASES = [
|
# Finds a youtube url based off of the given information.
|
||||||
"official video", "official music video",
|
# The query to youtube is constructed like this:
|
||||||
]
|
# "<song_name> <artist_name> <search terms>"
|
||||||
|
# If *download_first* is provided, the first link found will be returned without ranking the results.
|
||||||
|
# If *select_link* is provided, a menu of options is meant to be shown for the user to choose their poison (not implemented yet; the flag currently has no effect)
|
||||||
|
#
|
||||||
|
# ```
|
||||||
|
# Youtube.find_url(spotify_metadata, search_terms: "lyrics")
|
||||||
|
# => "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
|
||||||
|
# ```
|
||||||
|
def find_url(spotify_metadata : JSON::Any, search_terms = "",
|
||||||
|
download_first = false, select_link = false) : String?
|
||||||
|
|
||||||
alias NODES_CLASS = Array(Hash(String, String))
|
song_name = spotify_metadata["name"].as_s
|
||||||
|
artist_name = spotify_metadata["artists"][0]["name"].as_s
|
||||||
|
|
||||||
|
human_query = song_name + " " + artist_name + " " + search_terms.strip
|
||||||
|
url_query = human_query.gsub(" ", "+")
|
||||||
|
|
||||||
|
url = "https://www.youtube.com/results?search_query=" + url_query
|
||||||
|
|
||||||
|
response = HTTP::Client.get(url)
|
||||||
|
|
||||||
|
yt_metadata = get_yt_search_metadata(response.body)
|
||||||
|
|
||||||
|
if yt_metadata.size == 0
|
||||||
|
puts "There were no results for this query on youtube: \"#{human_query}\""
|
||||||
|
return nil
|
||||||
|
end
|
||||||
|
|
||||||
|
root = "https://youtube.com"
|
||||||
|
|
||||||
|
if download_first
|
||||||
|
return root + yt_metadata[0]["href"]
|
||||||
|
end
|
||||||
|
|
||||||
|
if select_link
|
||||||
|
# return select_link_menu()
|
||||||
|
end
|
||||||
|
|
||||||
|
ranked = Ranker.rank_videos(spotify_metadata, yt_metadata, human_query)
|
||||||
|
|
||||||
|
begin
|
||||||
|
return root + yt_metadata[ranked[0]["index"]]["href"]
|
||||||
|
rescue IndexError
|
||||||
|
return nil
|
||||||
|
end
|
||||||
|
|
||||||
|
exit 1
|
||||||
|
end
|
||||||
|
|
||||||
|
#
|
||||||
|
# Placeholder for a menu that would let the user pick one of the found links.
# The body is empty, so nothing is implemented here yet.
# NOTE(review): the declared return type is `String` but an empty body yields
# nil -- this will need a real return value before the method can be called.
private def select_link_menu() : String

end
|
||||||
|
|
||||||
|
# Finds valid video links from the body of a `HTTP::Client.get` request
# against the youtube search page.
# Returns a `YT_METADATA_CLASS`: one hash per video with the keys "title",
# "href", "timestamp" (as displayed, e.g. "3:45") and "duration_ms" (the
# timestamp converted to milliseconds, stored as a String).
# Prints a message and exits with status 1 if no `ytInitialData` JSON could
# be read from the page.
private def get_yt_search_metadata(response_body : String) : YT_METADATA_CLASS
  yt_initial_data : JSON::Any = JSON.parse("{}")

  response_body.each_line do |line|
    # timestamp 11/8/2020:
    # youtube's html page has a line previous to this literally with 'scraper_data_begin' as a comment
    if line.includes?("var ytInitialData")
      # Extract JSON data from line
      data = line.split(" = ")[2].delete(';')
      dataEnd = (data.index("</script>") || 0) - 1

      begin
        yt_initial_data = JSON.parse(data[0..dataEnd])
      rescue
        # Unparseable payload: give up scanning, the emptiness check below
        # reports the problem.
        break
      end
    end
  end

  if yt_initial_data == JSON.parse("{}")
    puts "Youtube has changed the way it organizes its webpage, submit a bug"
    puts "saying it has done so on https://github.com/cooperhammond/irs"
    exit(1)
  end

  # where the vid metadata lives
  yt_initial_data = yt_initial_data["contents"]["twoColumnSearchResultsRenderer"]["primaryContents"]["sectionListRenderer"]["contents"]

  video_metadata = [] of VID_METADATA_CLASS

  i = 0
  while true
    begin
      # video title
      raw_metadata = yt_initial_data[0]["itemSectionRenderer"]["contents"][i]["videoRenderer"]

      metadata = {} of String => VID_VALUE_CLASS

      metadata["title"] = raw_metadata["title"]["runs"][0]["text"].as_s
      metadata["href"] = raw_metadata["navigationEndpoint"]["commandMetadata"]["webCommandMetadata"]["url"].as_s
      timestamp = raw_metadata["lengthText"]["simpleText"].as_s
      metadata["timestamp"] = timestamp

      # Fold every ":"-separated field, most significant first, so that
      # "S", "M:SS" and "H:MM:SS" are all converted correctly. Reading only
      # the first two fields would treat "1:02:03" as 1 minute 2 seconds.
      total_seconds = timestamp.split(":").reduce(0) { |acc, part| acc * 60 + part.to_i }
      metadata["duration_ms"] = (total_seconds * 1000).to_s

      video_metadata.push(metadata)
    rescue IndexError
      # Ran past the last search result.
      break
    rescue Exception
      # Best effort: entries that lack one of the expected fields, or whose
      # timestamp is not numeric, are skipped.
    end
    i += 1
  end

  return video_metadata
end
|
||||||
|
|
||||||
# Checks if the given URL is a valid youtube URL
|
# Checks if the given URL is a valid youtube URL
|
||||||
#
|
#
|
||||||
|
@ -62,186 +169,4 @@ module Youtube
|
||||||
|
|
||||||
return response.body.includes?("status=ok")
|
return response.body.includes?("status=ok")
|
||||||
end
|
end
|
||||||
|
|
||||||
# Finds a youtube url based off of the given information.
|
|
||||||
# The query to youtube is constructed like this:
|
|
||||||
# "<song_name> <artist_name> <search terms>"
|
|
||||||
# If *download_first* is provided, the first link found will be downloaded.
|
|
||||||
#
|
|
||||||
# ```
|
|
||||||
# Youtube.find_url("Bohemian Rhapsody", "Queen")
|
|
||||||
# => "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
|
|
||||||
# ```
|
|
||||||
def find_url(song_name : String, artist_name : String, search_terms = "",
|
|
||||||
download_first = false) : String?
|
|
||||||
query = (song_name + " " + artist_name + " " + search_terms).strip.gsub(" ", "+")
|
|
||||||
|
|
||||||
url = "https://www.youtube.com/results?search_query=" + query
|
|
||||||
|
|
||||||
response = HTTP::Client.get(url)
|
|
||||||
|
|
||||||
valid_nodes = get_video_link_nodes(response.body)
|
|
||||||
|
|
||||||
if valid_nodes.size == 0
|
|
||||||
puts "There were no results for that query."
|
|
||||||
return nil
|
|
||||||
end
|
|
||||||
|
|
||||||
root = "https://youtube.com"
|
|
||||||
|
|
||||||
return root + valid_nodes[0]["href"] if download_first
|
|
||||||
|
|
||||||
ranked = rank_videos(song_name, artist_name, query, valid_nodes)
|
|
||||||
|
|
||||||
begin
|
|
||||||
return root + valid_nodes[ranked[0]["index"]]["href"]
|
|
||||||
rescue IndexError
|
|
||||||
return nil
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
# Will rank videos according to their title and the user input
|
|
||||||
# Return:
|
|
||||||
# [
|
|
||||||
# {"points" => x, "index" => x},
|
|
||||||
# ...
|
|
||||||
# ]
|
|
||||||
private def rank_videos(song_name : String, artist_name : String,
|
|
||||||
query : String, nodes : Array(Hash(String, String))) : Array(Hash(String, Int32))
|
|
||||||
points = [] of Hash(String, Int32)
|
|
||||||
index = 0
|
|
||||||
|
|
||||||
nodes.each do |node|
|
|
||||||
pts = 0
|
|
||||||
|
|
||||||
pts += points_compare(song_name, node["title"])
|
|
||||||
pts += points_compare(artist_name, node["title"])
|
|
||||||
pts += count_buzzphrases(query, node["title"])
|
|
||||||
|
|
||||||
points.push({
|
|
||||||
"points" => pts,
|
|
||||||
"index" => index,
|
|
||||||
})
|
|
||||||
index += 1
|
|
||||||
end
|
|
||||||
|
|
||||||
# Sort first by points and then by original index of the song
|
|
||||||
points.sort! { |a, b|
|
|
||||||
if b["points"] == a["points"]
|
|
||||||
a["index"] <=> b["index"]
|
|
||||||
else
|
|
||||||
b["points"] <=> a["points"]
|
|
||||||
end
|
|
||||||
}
|
|
||||||
|
|
||||||
return points
|
|
||||||
end
|
|
||||||
|
|
||||||
# Returns an `Int` based off the number of points worth assigning to the
|
|
||||||
# matchiness of the string. First the strings are downcased and then all
|
|
||||||
# nonalphanumeric characters are stripped.
|
|
||||||
# If *item1* includes *item2*, return 3 pts.
|
|
||||||
# If after the items have been blanked, *item1* includes *item2*,
|
|
||||||
# return 1 pts.
|
|
||||||
# Else, return 0 pts.
|
|
||||||
private def points_compare(item1 : String, item2 : String) : Int32
|
|
||||||
if item2.includes?(item1)
|
|
||||||
return 3
|
|
||||||
end
|
|
||||||
|
|
||||||
item1 = item1.downcase.gsub(/[^a-z0-9]/, "")
|
|
||||||
item2 = item2.downcase.gsub(/[^a-z0-9]/, "")
|
|
||||||
|
|
||||||
if item2.includes?(item1)
|
|
||||||
return 1
|
|
||||||
else
|
|
||||||
return 0
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
# Checks if there are any phrases in the title of the video that would
|
|
||||||
# indicate audio having what we want.
|
|
||||||
# *video_name* is the title of the video, and *query* is what the user the
|
|
||||||
# program searched for. *query* is needed in order to make sure we're not
|
|
||||||
# subtracting points from something that's naturally in the title
|
|
||||||
private def count_buzzphrases(query : String, video_name : String) : Int32
|
|
||||||
good_phrases = 0
|
|
||||||
bad_phrases = 0
|
|
||||||
|
|
||||||
GOLDEN_PHRASES.each do |gold_phrase|
|
|
||||||
gold_phrase = gold_phrase.downcase.gsub(/[^a-z0-9]/, "")
|
|
||||||
|
|
||||||
if query.downcase.gsub(/[^a-z0-9]/, "").includes?(gold_phrase)
|
|
||||||
next
|
|
||||||
elsif video_name.downcase.gsub(/[^a-z0-9]/, "").includes?(gold_phrase)
|
|
||||||
good_phrases += 1
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
GARBAGE_PHRASES.each do |garbage_phrase|
|
|
||||||
garbage_phrase = garbage_phrase.downcase.gsub(/[^a-z0-9]/, "")
|
|
||||||
|
|
||||||
if query.downcase.gsub(/[^a-z0-9]/, "").includes?(garbage_phrase)
|
|
||||||
next
|
|
||||||
elsif video_name.downcase.gsub(/[^a-z0-9]/, "").includes?(garbage_phrase)
|
|
||||||
bad_phrases += 1
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
return good_phrases - bad_phrases
|
|
||||||
end
|
|
||||||
|
|
||||||
# Finds valid video links from a `HTTP::Client.get` request
|
|
||||||
# Returns an `Array` of `XML::Node`
|
|
||||||
private def get_video_link_nodes(response_body : String) : NODES_CLASS
|
|
||||||
yt_initial_data : JSON::Any = JSON.parse("{}")
|
|
||||||
|
|
||||||
response_body.each_line do |line|
|
|
||||||
# timestamp 11/8/2020:
|
|
||||||
# youtube's html page has a line previous to this literally with 'scraper_data_begin' as a comment
|
|
||||||
if line.includes?("var ytInitialData")
|
|
||||||
# Extract JSON data from line
|
|
||||||
data = line.split(" = ")[2].delete(';')
|
|
||||||
dataEnd = (data.index("</script>") || 0) - 1
|
|
||||||
|
|
||||||
begin
|
|
||||||
yt_initial_data = JSON.parse(data[0..dataEnd])
|
|
||||||
rescue
|
|
||||||
break
|
|
||||||
end
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
if yt_initial_data == JSON.parse("{}")
|
|
||||||
puts "Youtube has changed the way it organizes its webpage, submit a bug"
|
|
||||||
puts "saying it has done so on https://github.com/cooperhammond/irs"
|
|
||||||
exit(1)
|
|
||||||
end
|
|
||||||
|
|
||||||
# where the vid metadata lives
|
|
||||||
yt_initial_data = yt_initial_data["contents"]["twoColumnSearchResultsRenderer"]["primaryContents"]["sectionListRenderer"]["contents"]
|
|
||||||
|
|
||||||
video_metadata = [] of Hash(String, String)
|
|
||||||
|
|
||||||
i = 0
|
|
||||||
while true
|
|
||||||
begin
|
|
||||||
# video title
|
|
||||||
raw_metadata = yt_initial_data[0]["itemSectionRenderer"]["contents"][i]["videoRenderer"]
|
|
||||||
|
|
||||||
metadata = {} of String => String
|
|
||||||
|
|
||||||
metadata["title"] = raw_metadata["title"]["runs"][0]["text"].as_s
|
|
||||||
metadata["href"] = raw_metadata["navigationEndpoint"]["commandMetadata"]["webCommandMetadata"]["url"].as_s
|
|
||||||
|
|
||||||
video_metadata.push(metadata)
|
|
||||||
rescue IndexError
|
|
||||||
break
|
|
||||||
rescue Exception
|
|
||||||
end
|
|
||||||
i += 1
|
|
||||||
end
|
|
||||||
|
|
||||||
return video_metadata
|
|
||||||
end
|
|
||||||
end
|
end
|
Loading…
Reference in a new issue