filters-maker/crawler.py

import os
import sys
import requests
import re

# Startup banner. This is a module-level statement, so it runs whenever the
# module is loaded (imported or executed as a script).
print('starting . . . ')

def clear_old_files(incoming):
    """Delete the file at ``incoming`` on a best-effort basis.

    Any ``OSError`` raised by the deletion (for example because the file
    does not exist) is swallowed on purpose, so the call never fails for
    that reason. Nothing is returned.
    """
    try:
        # os.unlink is the same operation as os.remove.
        os.unlink(incoming)
    except OSError:
        return

def download_filters(url, incoming, timeout=30):
    """Download ``url`` and append the response body to the file ``incoming``.

    The file is opened in binary append mode, so repeated calls accumulate
    data in the same file.

    Args:
        url: Address of the filter list to fetch.
        incoming: Path of the local file the body is appended to.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block forever on a stalled server.

    Returns:
        The ``url`` that was requested, whether or not the download succeeded.

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the timeout expires.
    """
    print("downloading: ",url)

    response = requests.get(url, timeout=timeout)
    if response.status_code == requests.codes.ok:
        with open(incoming, 'ab') as f:
            # The body is already fully downloaded (no streaming), so one
            # write is equivalent to iterating the response in small chunks.
            f.write(response.content)
    else:
        # Report the failure instead of silently leaving the file untouched.
        import sys
        print("download failed: ", url, response.status_code, file=sys.stderr)
    return url

def filtering(filters_welcome):
    """Rewrite ``filters_welcome`` in place, keeping only real entries.

    A line is kept when it contains something other than whitespace and does
    not start with one of the skipped prefixes (comment markers, zone-file
    directives and AdBlock exception/comment markers). Every kept line is
    written back terminated by a single newline.
    """
    print("filtering . . .")
    skipped_prefixes = ('#',';','@','$','  NS','@@||','!')
    with open(filters_welcome, 'r') as source:
        kept = [
            row + '\n'
            for row in source.read().splitlines()
            if row.strip() and not row.startswith(skipped_prefixes)
        ]
    with open(filters_welcome, 'w') as sink:
        sink.writelines(kept)
        print("++ successful!")

def filteringcon(filters_regex_one):
    """Reduce every line of ``filters_regex_one`` to its bare content, in place.

    For each line, in this order:
      1. everything from the first ``;`` to the end of the line is removed;
      2. a trailing `` CNAME .`` (optionally followed by one space) is removed;
      3. every ``|`` and ``^`` character is removed.

    The file is read once and written once. Each output line ends with
    exactly one newline, so running the function repeatedly does not add
    blank lines at the end of the file.
    """
    print("filtering . . .")

    # The dot is escaped so only a literal " CNAME ." target is stripped;
    # an unescaped dot would also strip " CNAME x" for any character x.
    comment = re.compile(r';.*')
    cname_tail = re.compile(r' CNAME \.$')
    cname_tail_space = re.compile(r' CNAME \. $')

    cleaned = []
    with open(filters_regex_one) as f:
        for raw in f:
            line = raw.rstrip('\n')
            line = comment.sub('', line)
            line = cname_tail.sub('', line)
            # Removing a comment can leave " CNAME . " with a trailing space.
            line = cname_tail_space.sub('', line)
            # Removing every '|' also covers the '||' marker.
            line = line.replace('|', '').replace('^', '')
            cleaned.append(line + '\n')

    with open(filters_regex_one, 'w') as f:
        f.writelines(cleaned)
    print("++ successful!")

def killingdup(duplicated_file):
    """Remove duplicate lines from ``duplicated_file`` in place.

    The first occurrence of each line is kept and the original order is
    preserved, so the output is deterministic. Lines are compared without
    their trailing newline, and every written line ends with one newline;
    a final line lacking a newline is therefore neither treated as distinct
    from an identical earlier line nor glued onto another line.
    """
    print('Getting rid of duplicated line')
    with open(duplicated_file, 'r') as f:
        # dict keys keep insertion order, giving an order-preserving dedupe.
        unique = dict.fromkeys(line.rstrip('\n') for line in f)
    with open(duplicated_file, 'w') as f:
        f.writelines(line + '\n' for line in unique)
    print("++ successful!")

def excluded(excluded_file ,incoming):
    """Append `` CNAME .`` to entries of ``incoming`` that are not excluded.

    ``excluded_file`` is read as a whitespace-separated list of entries to
    drop. ``incoming`` is then rewritten in place:

      * blank lines and lines starting with ``;``, ``$``, ``@`` or ``  IN``
        are copied through unchanged;
      * any other line that exactly matches an excluded entry is dropped;
      * every remaining line is written with `` CNAME .`` appended.

    Args:
        excluded_file: Path of the file listing entries to leave out.
        incoming: Path of the file to rewrite.
    """
    # Open the path argument; the name `excluded` is this function itself.
    with open(excluded_file ,'r') as f:
        exclude = set(f.read().split())
    with open(incoming ,'r') as f:
        lines = f.read().splitlines()

    passthrough = (';','$','@','  IN')
    with open(incoming ,'w') as f:
        for line in lines:
            # Pass-through lines are checked first so they never get a
            # CNAME target appended.
            if not line.strip() or line.startswith(passthrough):
                f.write(line + '\n')
            elif line not in exclude:
                f.write(line + ' CNAME .\n')

if __name__ == "__main__":
    # Script entry point: fetch one list into a local file, then run the
    # clean-up passes over that same file in a fixed order.
    target = 'test.txt'
    download_filters('https://filters.kylz.nl/RPZ/someonewhocares/rpz.txt', target)
    for step in (filtering, filteringcon, killingdup):
        step(target)
init 2021-07-28 14:03:52 +00:00			`import os`
			`import sys`
			`import requests`
			`import re`

			`print('starting . . . ')`

Update 'crawler.py' 2021-08-02 15:25:59 +00:00			`def clear_old_files(incoming):`
Hype Hype 2021-08-02 11:50:11 +00:00			`try:`
Fixing Attempt v2-2 2021-08-02 14:00:05 +00:00			`os.remove(incoming)`
Hype Hype 2021-08-02 11:50:11 +00:00			`except OSError:`
			`pass`
modi 2021-08-02 13:47:46 +00:00
Update 'crawler.py' 2021-08-02 15:25:59 +00:00			`def download_filters(url,incoming):`
init 2021-07-28 14:03:52 +00:00			`print("downloading: ",url)`

			`get = requests.get(url)`
			`if get.status_code == requests.codes.ok:`
Fixing Attempt v2-2 2021-08-02 14:00:05 +00:00			`with open(incoming, 'ab') as f:`
init 2021-07-28 14:03:52 +00:00			`for data in get:`
			`f.write(data)`
			`return url`

			`def filtering(filters_welcome):`
			`print("filtering . . .")`
			`with open(filters_welcome, 'r') as f:`
			`lines = f.read().splitlines()`
			`with open(filters_welcome, 'w') as f:`
			`for line in lines:`
Support for AdBlock Format 2021-08-02 08:15:42 +00:00			`if not line.startswith(('#',';','@','$',' NS','@@\|\|','!')) and line.strip():`
init 2021-07-28 14:03:52 +00:00			`f.write('\n'.join([line + '\n']))`
			`print("++ successful!")`
			`f.close()`

			`def filteringcon(filters_regex_one):`
			`print("filtering . . .")`
			`with open(filters_regex_one) as f:`
			`file = f.read().split('\n')`
			`for i in range(len(file)):`
			`file[i] = re.sub(';.*', '', file[i])`
			`file[i] = re.sub(' CNAME .$', '', file[i])`
			`file[i] = re.sub(' CNAME . $', '', file[i])`
			`with open(filters_regex_one, 'w') as f1:`
			`f1.writelines(["%s\n" % item for item in file])`
			`print("++ successful!")`
			`f.close()`

Support for AdBlock Format 2021-08-02 08:15:42 +00:00			`a = ['\|\|','^','\|']`
			`lst = []`
			`with open(filters_regex_one, 'r') as f:`
			`for line in f:`
			`for word in a:`
			`if word in line:`
			`line = line.replace(word,'')`
			`lst.append(line)`
			`f.close()`
			`with open(filters_regex_one, 'w') as f:`
			`for line in lst:`
			`f.write(line)`
			`f.close()`

init 2021-07-28 14:03:52 +00:00			`def killingdup(duplicated_file):`
			`print('Getting rid of duplicated line')`
			`with open(duplicated_file, 'r') as f:`
			`lines = set(f.readlines())`
			`with open(duplicated_file, 'w') as f:`
			`f.writelines(set(lines))`
			`print("++ successful!")`
			`f.close()`

Update 'crawler.py' 2021-08-02 15:30:49 +00:00			`def excluded(excluded_file ,incoming):`
			`with open(excluded ,'r') as f:`
			`exclude = f.read().split()`
			`with open(incoming ,'r') as f:`
			`lines = f.read().splitlines() # read lines`
Update 'crawler.py' 2021-08-02 15:34:51 +00:00			`with open(incoming ,'w') as f:`
Update 'crawler.py' 2021-08-02 15:30:49 +00:00			`for line in lines:`
			`if line.strip() and not line in exclude and not line.startswith(';'):`
			`f.write('\n'.join([line + ' CNAME .\n']))`
			`elif line.startswith((';','$','@',' IN')):`
			`f.write('\n'.join([line + '\n']))`
			`elif not line.strip():`
			`f.write('\n'.join([line + '\n']))`

Fixing Attempt v2-2 2021-08-02 14:00:05 +00:00			`if __name__ == "__main__":`
			`download_filters('https://filters.kylz.nl/RPZ/someonewhocares/rpz.txt','test.txt')`
			`filtering('test.txt')`
			`filteringcon('test.txt')`
			`killingdup('test.txt')`