# filters-maker/crawler.py
import os
import sys
import requests
import re
# Progress banner so the user sees the crawler has started.
startup_banner = 'starting . . . '
print(startup_banner)
def clear_old_files(incoming):
    """Delete the file *incoming* if it exists, ignoring any OS error.

    Used to start from a clean slate, because the downloader opens its
    output file in append ('ab') mode.
    """
    if os.path.exists(incoming):
        try:
            os.remove(incoming)
        except OSError:
            # Best-effort cleanup: a vanished or locked file is not fatal.
            pass
def download_filters(url, incoming):
    """Download *url* and append its body to the file *incoming*.

    The file is opened in append ('ab') mode so several filter lists can
    be merged into one output file; callers should clear the file first
    (see clear_old_files).

    Returns *url* on HTTP 200, or None on any other status code
    (best-effort, no exception raised for HTTP errors).
    """
    print("downloading: ", url)
    # stream=True avoids loading the whole body into memory at once.
    get = requests.get(url, stream=True)
    if get.status_code == requests.codes.ok:
        with open(incoming, 'ab') as f:
            # Explicit chunked iteration; iterating the Response object
            # directly would yield tiny 128-byte chunks.
            for data in get.iter_content(chunk_size=8192):
                f.write(data)
        return url
    return None
def filtering(filters_welcome):
    """Drop comment/directive lines from the filter file, in place.

    Keeps only non-blank lines that do not start with one of the known
    comment or directive prefixes ('#', ';', '@', '$', ' NS', '@@||',
    '!').  NOTE: '@' already covers '@@||'; the tuple entry is kept for
    documentation value.
    """
    print("filtering . . .")
    with open(filters_welcome, 'r') as f:
        lines = f.read().splitlines()
    # Rewrite the same file with only the surviving lines.
    with open(filters_welcome, 'w') as f:
        for line in lines:
            if line.strip() and not line.startswith(('#', ';', '@', '$', ' NS', '@@||', '!')):
                # Original wrote '\n'.join([line + '\n']) -- a no-op join
                # around a single element; this is the same output.
                f.write(line + '\n')
    print("++ successful!")
def filteringcon(filters_regex_one):
    """Strip RPZ/adblock decoration from the filter file, in place.

    Pass 1: remove ';' comments and trailing ' CNAME .' record targets.
    Pass 2: remove adblock markup tokens '||', '^' and '|'.

    NOTE(review): the '.' in the CNAME patterns is an unescaped regex
    metacharacter, so ' CNAME x' is stripped too -- harmless for RPZ
    data where the target is always '.', and kept for compatibility.
    """
    print("filtering . . .")
    with open(filters_regex_one) as f:
        entries = f.read().split('\n')
    for i, entry in enumerate(entries):
        entry = re.sub(r';.*', '', entry)
        entry = re.sub(r' CNAME .$', '', entry)
        entry = re.sub(r' CNAME . $', '', entry)
        entries[i] = entry
    with open(filters_regex_one, 'w') as f1:
        # split('\n') leaves a trailing '' for a final newline, so this
        # writes one extra blank line -- preserved from the original.
        f1.writelines("%s\n" % item for item in entries)
    print("++ successful!")
    # Pass 2: delete adblock tokens; str.replace is already a no-op when
    # the token is absent, so no membership pre-check is needed.
    cleaned = []
    with open(filters_regex_one, 'r') as f:
        for line in f:
            for token in ('||', '^', '|'):
                line = line.replace(token, '')
            cleaned.append(line)
    with open(filters_regex_one, 'w') as f:
        f.writelines(cleaned)
def killingdup(duplicated_file):
    """Remove duplicate lines from *duplicated_file*, in place.

    Keeps the first occurrence of each line, preserving file order.
    (The original wrapped the lines in set() twice, which was redundant
    and scrambled the output order nondeterministically.)
    """
    print('Getting rid of duplicated line')
    with open(duplicated_file, 'r') as f:
        # dict.fromkeys de-duplicates while keeping insertion order.
        unique_lines = dict.fromkeys(f.readlines())
    with open(duplicated_file, 'w') as f:
        f.writelines(unique_lines)
    print("++ successful!")
if __name__ == "__main__":
    # Start from a clean slate: download_filters opens 'test.txt' in
    # append ('ab') mode, so a stale file from a previous run would
    # otherwise keep accumulating entries.  clear_old_files was defined
    # for exactly this but was never called.
    clear_old_files('test.txt')
    download_filters('https://filters.kylz.nl/RPZ/someonewhocares/rpz.txt', 'test.txt')
    filtering('test.txt')
    filteringcon('test.txt')
    killingdup('test.txt')