chg: [modules] crawl pasties domains
Terrtia committed Aug 21, 2023
1 parent f05c7b6 commit 0cb7431
Showing 6 changed files with 177 additions and 72 deletions.
1 change: 1 addition & 0 deletions bin/lib/ConfigLoader.py
@@ -83,6 +83,7 @@ def get_all_keys_values_from_section(self, section):
        else:
            return []


# # # # Directory Config # # # #

config_loader = ConfigLoader()
28 changes: 28 additions & 0 deletions bin/lib/regex_helper.py
@@ -113,6 +113,34 @@ def regex_finditer(r_key, regex, item_id, content, max_time=30):
        proc.terminate()
        sys.exit(0)

def _regex_match(r_key, regex, content):
    if re.match(regex, content):
        r_serv_cache.set(r_key, 1)
        r_serv_cache.expire(r_key, 360)

def regex_match(r_key, regex, item_id, content, max_time=30):
    proc = Proc(target=_regex_match, args=(r_key, regex, content))
    try:
        proc.start()
        proc.join(max_time)
        if proc.is_alive():
            proc.terminate()
            # Statistics.incr_module_timeout_statistic(r_key)
            err_mess = f"{r_key}: processing timeout: {item_id}"
            logger.info(err_mess)
            return False
        else:
            if r_serv_cache.exists(r_key):
                r_serv_cache.delete(r_key)
                return True
            else:
                r_serv_cache.delete(r_key)
                return False
    except KeyboardInterrupt:
        print("Caught KeyboardInterrupt, terminating regex worker")
        proc.terminate()
        sys.exit(0)

def _regex_search(r_key, regex, content):
    if re.search(regex, content):
        r_serv_cache.set(r_key, 1)
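The new regex_match follows the same worker-process pattern as regex_finditer and regex_search above: the match runs in a child process so a pathological pattern cannot hang the module past max_time, and a short-lived Redis key carries the boolean result back to the parent. A minimal usage sketch, assuming a running AIL environment where regex_helper's Redis connection is available (the cache key, item id, and pattern below are illustrative only):

    from lib import regex_helper

    content = 'https://pastebin.com/Xy12Ab34 found in a paste'
    r_key = 'regex_cache:MyModule'  # illustrative cache key

    # True if the content matches within 10s; False on no match or on timeout
    if regex_helper.regex_match(r_key, r'https?://pastebin\.com/\w+', 'illustrative_item_id', content, max_time=10):
        print('content starts with a pastebin URL')

Unlike regex_search, this variant uses re.match, so the pattern must match at the very beginning of the content.
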
144 changes: 144 additions & 0 deletions bin/modules/Pasties.py
@@ -0,0 +1,144 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*
"""
The Pasties Module
======================
This module spots domain-pasties services for further processing
"""

##################################
# Import External packages
##################################
import os
import sys
import time

from pyfaup.faup import Faup

sys.path.append(os.environ['AIL_BIN'])
##################################
# Import Project packages
##################################
from modules.abstract_module import AbstractModule
from lib.ConfigLoader import ConfigLoader
from lib import crawlers

# TODO add url validator

pasties_blocklist_urls = set()
pasties_domains = {}

class Pasties(AbstractModule):
    """
    Pasties module for AIL framework
    """

    def __init__(self):
        super(Pasties, self).__init__()
        self.faup = Faup()

        config_loader = ConfigLoader()
        self.r_cache = config_loader.get_redis_conn("Redis_Cache")

        self.pasties = {}
        self.urls_blocklist = set()
        self.load_pasties_domains()

        # Send module state to logs
        self.logger.info(f'Module {self.module_name} initialized')

    def load_pasties_domains(self):
        self.pasties = {}
        self.urls_blocklist = set()

        domains_pasties = os.path.join(os.environ['AIL_HOME'], 'files/domains_pasties')
        if os.path.exists(domains_pasties):
            with open(domains_pasties) as f:
                for line in f:
                    url = line.strip()
                    if url:  # TODO validate line
                        self.faup.decode(url)
                        url_decoded = self.faup.get()
                        host = url_decoded['host']
                        # if url_decoded.get('port', ''):
                        #     host = f'{host}:{url_decoded["port"]}'
                        path = url_decoded.get('resource_path', '')
                        # print(url_decoded)
                        if path and path != '/':
                            if path[-1] != '/':
                                path = f'{path}/'
                        else:
                            path = None

                        if host in self.pasties:
                            if path:
                                self.pasties[host].add(path)
                        else:
                            if path:
                                self.pasties[host] = {path}
                            else:
                                self.pasties[host] = set()

        url_blocklist = os.path.join(os.environ['AIL_HOME'], 'files/domains_pasties_blacklist')
        if os.path.exists(url_blocklist):
            with open(url_blocklist) as f:
                for line in f:
                    url = line.strip()
                    self.faup.decode(url)
                    url_decoded = self.faup.get()
                    host = url_decoded['host']
                    # if url_decoded.get('port', ''):
                    #     host = f'{host}:{url_decoded["port"]}'
                    path = url_decoded.get('resource_path', '')
                    url = f'{host}{path}'
                    if url_decoded['query_string']:
                        url = url + url_decoded['query_string']
                    self.urls_blocklist.add(url)

    def send_to_crawler(self, url, obj_id):
        if not self.r_cache.exists(f'{self.module_name}:url:{url}'):
            self.r_cache.set(f'{self.module_name}:url:{url}', int(time.time()))
            self.r_cache.expire(f'{self.module_name}:url:{url}', 86400)
            crawlers.create_task(url, depth=0, har=False, screenshot=False, proxy='force_tor', priority=60, parent=obj_id)

    def compute(self, message):
        url, item_id = message.split()

        self.faup.decode(url)
        url_decoded = self.faup.get()
        # print(url_decoded)
        url_host = url_decoded['host']
        # if url_decoded.get('port', ''):
        #     url_host = f'{url_host}:{url_decoded["port"]}'
        path = url_decoded.get('resource_path', '')
        if url_host in self.pasties:
            if url.startswith('http://'):
                if url[7:] in self.urls_blocklist:
                    return None
            elif url.startswith('https://'):
                if url[8:] in self.urls_blocklist:
                    return None
            else:
                if url in self.urls_blocklist:
                    return None

            if not self.pasties[url_host]:
                if path and path != '/':
                    print('send to crawler', url_host, url)
                    self.send_to_crawler(url, item_id)
            else:
                if path.endswith('/'):
                    path_end = path[:-1]
                else:
                    path_end = f'{path}/'
                for url_path in self.pasties[url_host]:
                    if path.startswith(url_path):
                        if url_path != path and url_path != path_end:
                            print('send to crawler', url_path, url)
                            self.send_to_crawler(url, item_id)
                        break


if __name__ == '__main__':
    module = Pasties()
    module.run()
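
load_pasties_domains builds a host-to-path-prefixes map. An entry whose URL has no path (or only '/') yields an empty set, meaning any non-root path on that host is treated as a paste; stored paths are normalized to end with '/'. A sketch of the resulting structure for hypothetical file entries (the domains shown are illustrative, not taken from the actual files/domains_pasties list):

    # files/domains_pasties (hypothetical contents):
    #   https://pastebin.com
    #   https://gist.github.com/raw
    #
    # self.pasties after load_pasties_domains():
    {
        'pastebin.com': set(),         # empty set: any non-root path qualifies
        'gist.github.com': {'/raw/'},  # only paths extending /raw/ qualify
    }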
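
In compute, path_end normalizes trailing slashes so that a URL identical to a stored prefix (modulo the trailing '/') is treated as the service's landing page rather than a paste. Tracing the hypothetical mapping above:

    # 'https://pastebin.com/'             path '/' on an empty-set host   -> skipped
    # 'https://pastebin.com/Xy12Ab34'     non-root path                   -> sent to crawler
    # 'https://gist.github.com/raw/'      path equals prefix '/raw/'      -> skipped
    # 'https://gist.github.com/raw/ab12'  path extends prefix '/raw/'     -> sent to crawler

send_to_crawler additionally deduplicates through Redis, so a given URL is queued at most once per 24 hours (the 86400-second expire).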
71 changes: 0 additions & 71 deletions bin/modules/Zerobins.py

This file was deleted.

3 changes: 3 additions & 0 deletions bin/modules/abstract_module.py
@@ -92,6 +92,9 @@ def add_message_to_queue(self, message, queue_name=None):
    def get_available_queues(self):
        return self.queue.get_out_queues()

    def regex_match(self, regex, obj_id, content):
        return regex_helper.regex_match(self.r_cache_key, regex, obj_id, content, max_time=self.max_execution_time)

    def regex_search(self, regex, obj_id, content):
        return regex_helper.regex_search(self.r_cache_key, regex, obj_id, content, max_time=self.max_execution_time)

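With this wrapper, any module inheriting AbstractModule gets a timeout-guarded anchored match without touching regex_helper directly; the cache key and time budget come from the module itself (self.r_cache_key, self.max_execution_time). A hypothetical module sketch (the module name, pattern, message format, and queue name are all illustrative):

    from modules.abstract_module import AbstractModule

    class MyPGPWatcher(AbstractModule):  # hypothetical module
        def compute(self, message):
            obj_id, content = message.split(' ', 1)  # illustrative message format
            # anchored match, capped at self.max_execution_time in a child process
            if self.regex_match(r'-----BEGIN PGP MESSAGE-----', obj_id, content):
                self.add_message_to_queue(obj_id, 'Tags')  # illustrative queue
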
2 changes: 1 addition & 1 deletion configs/modules.cfg
@@ -162,7 +162,7 @@ publish = Importers,Tags
subscribe = Item
publish = Tags

[Zerobins]
[Pasties]
subscribe = Url

# [My_Module_Name]
