# filters-maker/crawler.py

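"""Helper routines for building DNS filter lists: download upstream blocklists,
strip comments, host-file address prefixes and Adblock syntax, drop excluded
and duplicate entries, and sort the result."""
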
import os
import requests
import re


def clear_old_files(incoming):
    # Remove the previous working file if it exists; ignore the error if it does not.
    try:
        os.remove(incoming)
    except OSError:
        pass


def download_filters(url, incoming):
    # Fetch one upstream filter list and append its raw bytes to the working file.
    print("downloading:", url)
    response = requests.get(url)
    if response.status_code == requests.codes.ok:
        with open(incoming, 'ab') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
    return url


def filtering(filters_welcome):
    # Drop comment lines, directives and blank lines from the downloaded list.
    unwanted = ('#', ';', '@', '$', ' NS', '@@||', '!')
    print("filtering . . .")
    with open(filters_welcome, 'r') as f:
        lines = f.read().splitlines()
    with open(filters_welcome, 'w') as f:
        for line in lines:
            if not line.startswith(unwanted) and line.strip():
                f.write(line + '\n')
    print("++ successful!")


def filteringcon(filters_regex_one):
    # Normalise raw entries: strip trailing comments, CNAME targets and
    # wildcard prefixes so only bare records remain.
    print("filtering . . .")
    with open(filters_regex_one) as f:
        entries = f.read().split('\n')
    for i in range(len(entries)):
        entries[i] = re.sub(r'\s\s+#.*', '', entries[i])
        entries[i] = re.sub(r' CNAME .$', '', entries[i])
        entries[i] = re.sub(r' CNAME . $', '', entries[i])
        entries[i] = re.sub(r'^\*.', '', entries[i])
        entries[i] = re.sub(r'\s\s+', ' ', entries[i])
        entries[i] = re.sub(r'#..*', '', entries[i])
        entries[i] = re.sub(r'CNAME . ;..*', '', entries[i])
        entries[i] = re.sub(r';..*', '', entries[i])
        entries[i] = re.sub(r'\A\.', '', entries[i])
    with open(filters_regex_one, 'w') as f:
        f.writelines(["%s\n" % item for item in entries])
    print("++ successful!")

    # Strip hosts-file address prefixes and Adblock anchors (||domain^, |domain),
    # and convert remaining '#' comment markers to ';'.
    with open(filters_regex_one) as f:
        entries = f.read().split('\n')
    for i in range(len(entries)):
        entries[i] = re.sub(r'0\.0\.0\.0 0\.0\.0\.0\Z', '', entries[i])
        entries[i] = re.sub(r'\A127\.0\.0\.1 ', '', entries[i])
        entries[i] = re.sub(r'\A0\.0\.0\.0 ', '', entries[i])
        entries[i] = re.sub(r'\A0 ', '', entries[i])
        entries[i] = re.sub(r'\A:: ', '', entries[i])
        entries[i] = re.sub(r'\A::1 ', '', entries[i])
        entries[i] = re.sub(r'\A\|\|', '', entries[i])
        entries[i] = re.sub(r'\^\Z', '', entries[i])
        entries[i] = re.sub(r'^\|', '', entries[i])
        entries[i] = re.sub(r'#', ';', entries[i])
    with open(filters_regex_one, 'w') as f:
        f.writelines(["%s\n" % item for item in entries])

    # Drop lines ending in local/loopback host names or RPZ passthru records.
    remove_words = ('localhost', 'localhost.localdomain', 'local', 'broadcasthost',
                    'loopback', 'ip6-localnet', 'ip6-mcastprefix', 'ip6-allnodes',
                    'ip6-allrouters', 'ip6-allhosts', 'ip6-loopback',
                    ' CNAME rpz-passthru.')
    with open(filters_regex_one, 'r') as f:
        lines = f.read().splitlines()
    with open(filters_regex_one, 'w') as f:
        for line in lines:
            if not line.endswith(remove_words):
                f.write(line + '\n')

    # Write one token per line so "address domain" pairs become bare entries.
    with open(filters_regex_one, 'r') as f:
        lines = f.read().splitlines()
    with open(filters_regex_one, 'w') as f:
        for line in lines:
            for token in line.split():
                f.write(token + '\n')


def killingdup(duplicated_file):
    # Remove duplicate lines (line order is not preserved).
    print('Getting rid of duplicated lines')
    with open(duplicated_file, 'r') as f:
        lines = set(f.readlines())
    with open(duplicated_file, 'w') as f:
        f.writelines(lines)
    print("++ successful!")


def excluded(excluded_file, incoming):
    # Drop entries listed in the exclude file while keeping directives and blanks.
    exline = (';', '$', '@', ' IN')
    with open(excluded_file, 'r') as f:
        exclude = set(f.read().split())
    with open(incoming, 'r') as f:
        lines = f.read().splitlines()
    with open(incoming, 'w') as f:
        for line in lines:
            if line.strip() and line not in exclude and not line.startswith(';'):
                f.write(line + '\n')
            elif line.startswith(exline):
                f.write(line + '\n')
            elif not line.strip():
                f.write(line + '\n')


def blankremover(incoming):
    # Rewrite the file with one non-empty token per line.
    with open(incoming, 'r') as f:
        tokens = f.read().split()
    with open(incoming, 'w') as f:
        for token in tokens:
            f.write(token + '\n')


def sort(incoming):
    # Sort the final list alphabetically in place.
    with open(incoming, 'r') as f:
        lines = sorted(f.readlines())
    with open(incoming, 'w') as f:
        for line in lines:
            f.write(line)
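

# Illustrative sketch only (not part of the original script): one way the helpers
# above might be chained into a build pipeline. The URL, file names and call
# order below are assumptions, not taken from the repository.
if __name__ == '__main__':
    INCOMING = 'incoming.txt'    # assumed working file name
    EXCLUDED = 'excluded.txt'    # assumed exclude-list file name

    clear_old_files(INCOMING)
    download_filters('https://example.com/filters.txt', INCOMING)  # placeholder URL
    filtering(INCOMING)
    filteringcon(INCOMING)
    if os.path.exists(EXCLUDED):
        excluded(EXCLUDED, INCOMING)
    killingdup(INCOMING)
    blankremover(INCOMING)
    sort(INCOMING)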