mirror of
https://github.com/2004content/rarbg.git
synced 2025-01-18 15:07:09 +00:00
Update clean.py
This commit is contained in:
parent
9c86226fdd
commit
6974148651
95
clean.py
95
clean.py
|
@ -1,70 +1,69 @@
|
||||||
|
import string
|
||||||
|
|
||||||
def fix(line, data):
    """Reduce one magnet link to its hash and title and append it to ``data``.

    Trackers are discarded; only the info-hash and the display name
    (``dn``) survive.

    Args:
        line: A magnet link. The caller is expected to have checked that it
            starts with the 20-character prefix ``magnet:?xt=urn:btih:``;
            only the prefix length is relied on here.
        data: List that receives one ``[prefix, hash, '&dn=', title]`` list
            per accepted link. It is extended in place.

    Returns:
        The same ``data`` list, whether or not the link was accepted, so
        that callers writing ``data = fix(line, data)`` never end up with
        ``None``.
    """
    # partition() copes with a missing '&' (params is then empty).
    # str.find() would return -1 rather than raise, and slicing with -1
    # silently drops the last character of the hash.
    hash_part, _, params = line[20:].partition('&')
    info_hash = hash_part.lower()

    # Accept only a non-empty run of hex digits. int(x, 16) is not used
    # because it also accepts '0x' prefixes, signs, underscores and
    # surrounding whitespace, none of which belong in an info-hash.
    if not info_hash or any(char not in '0123456789abcdef' for char in info_hash):
        return data

    # Put the '&' back so a leading 'dn=' is matched the same way as one
    # that follows other parameters. No 'dn=' at all gives an empty title.
    _, _, title = ('&' + params).partition('&dn=')
    # The title ends at the first tracker. Any other '&' is treated as part
    # of the title itself (unescaped names such as "Tom & Jerry").
    title = title.partition('&tr=')[0]
    # Strip non-ASCII characters.
    title = ''.join(char for char in title if ord(char) < 128)

    data.append(['magnet:?xt=urn:btih:', info_hash, '&dn=', title])
    return data
|
||||||
|
|
||||||
# Prefix every usable input line must start with (20 characters).
MAGNET_PREFIX = 'magnet:?xt=urn:btih:'
# The same prefix after a bad paste stripped its symbols.
MANGLED_PREFIX = 'magnetxturnbtih'

# One [prefix, hash, '&dn=', title] list per magnet link read from the input.
data = []
with open('everything.txt', encoding='utf-8') as file:
    for line in file:
        line = line.strip()
        if not line.startswith(MAGNET_PREFIX):
            continue

        # fix() extends ``data`` in place, so its return value is not
        # rebound here; the list can never be replaced by accident.

        # Paste error: a second, intact magnet link glued onto this line.
        second_start = line.find(MAGNET_PREFIX, 20)
        if second_start != -1:
            fix(line[second_start:], data)
            line = line[:second_start]

        # Paste error: a second link whose symbols were removed, leaving
        # "<mangled prefix><hash>n<title>" (the 'd' of 'dn' is missing too).
        mangled_start = line.find(MANGLED_PREFIX, 20)
        if mangled_start != -1:
            hash_start = mangled_start + len(MANGLED_PREFIX)
            hash_end = line.find('n', hash_start)
            if hash_end == -1:
                # No 'n' separator: everything left is the hash and there
                # is no title.
                second_hash, second_title = line[hash_start:], ''
            else:
                second_hash = line[hash_start:hash_end]
                second_title = line[hash_end + 1:]
            fix(MAGNET_PREFIX + second_hash + '&dn=' + second_title, data)
            line = line[:mangled_start]

        fix(line, data)

# Delete every punctuation character except '.' and '-', and turn spaces
# into '.', in a single pass over each title.
symbols = string.punctuation.replace('.', '').replace('-', '')
translation_table = str.maketrans(' ', '.', symbols)
for magnet in data:
    magnet[3] = magnet[3].translate(translation_table)

# De-duplicate by hash, keeping first-seen order (dicts preserve insertion
# order). A duplicate that has a title replaces an earlier one without a
# title, so a real name is never lost to an empty one.
titles = {}
for prefix, info_hash, dn, title in data:
    key = prefix + info_hash
    if key not in titles or (title and titles[key] == dn):
        titles[key] = dn + title

results = [key + value for key, value in titles.items()]

# Append, not overwrite: earlier runs' output is kept.
with open('output.txt', 'a', encoding='utf-8') as output:
    output.writelines(result + '\n' for result in results)
|
||||||
|
|
Loading…
Reference in a new issue