Skip to content

Commit

Permalink
chg: [dom-hash] add dom-hash object compute dom-hash for domains and …
Browse files Browse the repository at this point in the history
…crawled items
  • Loading branch information
Terrtia committed Oct 17, 2024
1 parent 35dd487 commit b988f46
Show file tree
Hide file tree
Showing 15 changed files with 1,141 additions and 6 deletions.
6 changes: 6 additions & 0 deletions bin/crawlers/Crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from lib.objects import CookiesNames
from lib.objects import Etags
from lib.objects.Domains import Domain
from lib.objects import DomHashs
from lib.objects import Favicons
from lib.objects.Items import Item
from lib.objects import Screenshots
Expand Down Expand Up @@ -348,6 +349,11 @@ def save_capture_response(self, parent_id, entries):
self.root_item = item_id
parent_id = item_id

# DOM-HASH
dom_hash = DomHashs.create(entries['html'])
dom_hash.add(self.date.replace('/', ''), item)
dom_hash.add_correlation('domain', '', self.domain.id)

title_content = crawlers.extract_title_from_html(entries['html'])
if title_content:
title = Titles.create_title(title_content)
Expand Down
8 changes: 4 additions & 4 deletions bin/lib/ail_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,15 @@
config_loader = None

AIL_OBJECTS = sorted({'chat', 'chat-subchannel', 'chat-thread', 'cookie-name', 'cve', 'cryptocurrency', 'decoded',
'domain', 'etag', 'favicon', 'file-name', 'hhhash','item', 'image', 'message', 'ocr', 'pgp',
'qrcode', 'screenshot', 'title', 'user-account', 'username'})
'domain', 'dom-hash', 'etag', 'favicon', 'file-name', 'hhhash','item', 'image', 'message', 'ocr',
'pgp', 'qrcode', 'screenshot', 'title', 'user-account', 'username'})

AIL_OBJECTS_WITH_SUBTYPES = {'chat', 'chat-subchannel', 'cryptocurrency', 'pgp', 'username', 'user-account'}

# TODO by object TYPE ????
AIL_OBJECTS_CORRELATIONS_DEFAULT = sorted({'chat', 'chat-subchannel', 'chat-thread', 'cve', 'cryptocurrency', 'decoded',
'domain', 'favicon', 'file-name', 'item', 'image', 'message', 'ocr', 'pgp',
'qrcode', 'screenshot', 'title', 'user-account', 'username'})
'domain', 'dom-hash', 'favicon', 'file-name', 'item', 'image', 'message',
'ocr', 'pgp', 'qrcode', 'screenshot', 'title', 'user-account', 'username'})

def get_ail_uuid():
ail_uuid = r_serv_db.get('ail:uuid')
Expand Down
4 changes: 4 additions & 0 deletions bin/lib/ail_updates.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,10 @@ def get_ail_float_version():
'message': 'Compress HAR',
'scripts': ['compress_har.py']
},
'v5.9': {
'message': 'Compute Domain/Items Dom-Hash',
'scripts': ['reprocess_dom_hash.py']
}
}

class AILBackgroundUpdate:
Expand Down
5 changes: 3 additions & 2 deletions bin/lib/correlations_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,13 +48,14 @@
"cryptocurrency": ["domain", "item", "message", "ocr", "qrcode"],
"cve": ["domain", "item", "message", "ocr", "qrcode"],
"decoded": ["domain", "item", "message", "ocr", "qrcode"],
"domain": ["cve", "cookie-name", "cryptocurrency", "decoded", "etag", "favicon", "hhhash", "item", "pgp", "title", "screenshot", "username"],
"domain": ["cve", "cookie-name", "cryptocurrency", "dom-hash", "decoded", "etag", "favicon", "hhhash", "item", "pgp", "title", "screenshot", "username"],
"dom-hash": ["domain", "item"],
"etag": ["domain"],
"favicon": ["domain", "item"], # TODO Decoded
"file-name": ["chat", "message"],
"hhhash": ["domain"],
"image": ["chat", "chat-subchannel", "chat-thread", "message", "ocr", "qrcode", "user-account"], # TODO subchannel + threads ????
"item": ["cve", "cryptocurrency", "decoded", "domain", "favicon", "pgp", "screenshot", "title", "username"], # chat ???
"item": ["cve", "cryptocurrency", "decoded", "domain", "dom-hash", "favicon", "pgp", "screenshot", "title", "username"], # chat ???
"message": ["chat", "chat-subchannel", "chat-thread", "cve", "cryptocurrency", "decoded", "file-name", "image", "ocr", "pgp", "user-account"],
"ocr": ["chat", "chat-subchannel", "chat-thread", "cve", "cryptocurrency", "decoded", "image", "message", "pgp", "user-account"],
"pgp": ["domain", "item", "message", "ocr"],
Expand Down
134 changes: 134 additions & 0 deletions bin/lib/objects/DomHashs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*

import os
import sys

from bs4 import BeautifulSoup
from hashlib import sha256
from flask import url_for

# import warnings
# warnings.filterwarnings("ignore", category=DeprecationWarning)
from pymisp import MISPObject

sys.path.append(os.environ['AIL_BIN'])
##################################
# Import Project packages
##################################
from lib.ConfigLoader import ConfigLoader
from lib.objects.abstract_daterange_object import AbstractDaterangeObject, AbstractDaterangeObjects

# Module-level settings, read once when this module is imported.
config_loader = ConfigLoader()
# Connection handle for the "Kvrocks_Objects" database.
r_objects = config_loader.get_db_conn("Kvrocks_Objects")
# "ail_domain" value from the "Notifications" config section; used as the URL
# prefix when DomHash.get_link() builds a link outside of a Flask context.
baseurl = config_loader.get_config_str("Notifications", "ail_domain")
# Drop the loader reference now that the settings above have been read.
config_loader = None


class DomHash(AbstractDaterangeObject):
    """
    AIL DomHash Object.

    The object id is the DOM hash itself; the module-level ``create`` helper
    derives it from HTML content with ``_compute_dom_hash``.
    """

    def __init__(self, id):
        super(DomHash, self).__init__('dom-hash', id)

    # def get_ail_2_ail_payload(self):
    #     payload = {'raw': self.get_gzip_content(b64=True),
    #                'compress': 'gzip'}
    #     return payload

    # # WARNING: UNCLEAN DELETE /!\ TEST ONLY /!\
    def delete(self):
        """Not implemented yet: currently a no-op."""
        # # TODO:
        pass

    # def get_content(self, r_type='str'): # TODO Get random item -> compute hash
    #     if r_type == 'str':
    #         return self._get_field('content')
    #     elif r_type == 'bytes':
    #         return self._get_field('content').encode()

    def get_link(self, flask_context=False):
        """
        Return the URL of the correlation view for this object.

        :param flask_context: when true, resolve the URL with Flask's
            ``url_for``; otherwise build it from the configured ``baseurl``.
        :return: the URL as a string
        """
        if flask_context:
            url = url_for('correlation.show_correlation', type=self.type, id=self.id)
        else:
            url = f'{baseurl}/correlation/show?type={self.type}&id={self.id}'
        return url

    def get_svg_icon(self):
        """Return the icon description (style, glyph, color, radius) for this object type."""
        return {'style': 'fas', 'icon': '\uf714', 'color': 'grey', 'radius': 5}

    def get_misp_object(self):
        """
        Build the MISP 'dom-hash' object for this DomHash.

        first_seen / last_seen are only set when available; a warning is
        logged if either is missing. Every tag of this object is copied onto
        each exported attribute.

        :return: the populated ``MISPObject``
        """
        obj_attrs = []
        obj = MISPObject('dom-hash')
        first_seen = self.get_first_seen()
        last_seen = self.get_last_seen()
        if first_seen:
            obj.first_seen = first_seen
        if last_seen:
            obj.last_seen = last_seen
        if not first_seen or not last_seen:
            self.logger.warning(
                f'Export error, None seen {self.type}:{self.subtype}:{self.id}, first={first_seen}, last={last_seen}')

        obj_attrs.append(obj.add_attribute('dom-hash', value=self.get_id()))
        # TODO ############################# URLS
        for obj_attr in obj_attrs:
            for tag in self.get_tags():
                obj_attr.add_tag(tag)
        # Return only once every attribute has been tagged, so the object is
        # returned regardless of how many tags there are.
        return obj

    def get_nb_seen(self):
        """Return the number of 'domain' correlations of this object."""
        return self.get_nb_correlation('domain')

    def get_meta(self, options=None):
        """
        Return the metadata dict of this object.

        :param options: optional set of option names forwarded to
            ``_get_meta``; defaults to an empty set.
        :return: the dict from ``_get_meta`` completed with 'id' and 'tags'
        """
        # A fresh set per call avoids sharing one mutable default between calls.
        if options is None:
            options = set()
        meta = self._get_meta(options=options)
        meta['id'] = self.id
        meta['tags'] = self.get_tags(r_list=True)
        return meta

    def create(self, _first_seen=None, _last_seen=None):
        """
        Create this object through ``self._create()``.

        ``_first_seen`` and ``_last_seen`` are accepted but not used here.
        """
        self._create()


def _compute_dom_hash(html_content):
    """
    Compute the DOM hash of an HTML document.

    The document is parsed with the "lxml" parser, the names of all tags
    returned by ``find_all()`` are joined with '|', and the result is hashed
    with SHA-256.

    :param html_content: HTML markup to hash
    :return: the first 32 hexadecimal characters of the SHA-256 digest
    """
    soup = BeautifulSoup(html_content, "lxml")
    # find_all() is the supported spelling; findAll() is a deprecated alias.
    to_hash = "|".join(t.name for t in soup.find_all()).encode()
    return sha256(to_hash).hexdigest()[:32]


def create(content):
    """
    Return the DomHash object matching some HTML content.

    The id is computed from ``content``; the object is created first if it
    does not exist yet.

    :param content: HTML markup
    :return: the corresponding ``DomHash`` instance
    """
    dom_hash = DomHash(_compute_dom_hash(content))
    if dom_hash.exists():
        return dom_hash
    dom_hash.create()
    return dom_hash


class DomHashs(AbstractDaterangeObjects):
    """
    DomHash Objects
    """

    def __init__(self):
        # Register the 'dom-hash' type together with its object class.
        super(DomHashs, self).__init__('dom-hash', DomHash)

    def sanitize_id_to_search(self, name_to_search):
        """Return the search term unchanged: no sanitization is applied here."""
        sanitized = name_to_search
        return sanitized


# if __name__ == '__main__':
# # from lib import crawlers
# # from lib.objects import Items
# # for item in Items.get_all_items_objects(filters={'sources': ['crawled']}):
# # title_content = crawlers.extract_title_from_html(item.get_content())
# # if title_content:
# # print(item.id, title_content)
# # title = create_title(title_content)
# # title.add(item.get_date(), item.id)
# titles = Titles()
# # for r in titles.get_ids_iterator():
# # print(r)
# r = titles.search_by_id('f7d57B', r_pos=True, case_sensitive=False)
# print(r)
3 changes: 3 additions & 0 deletions bin/lib/objects/ail_objects.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
from lib.objects import Etags
from lib.objects import Favicons
from lib.objects import FilesNames
from lib.objects import DomHashs
from lib.objects import HHHashs
from lib.objects.Items import Item, get_all_items_objects, get_nb_items_objects
from lib.objects import Images
Expand Down Expand Up @@ -91,6 +92,8 @@ def get_object(obj_type, subtype, obj_id):
return Favicons.Favicon(obj_id)
elif obj_type == 'file-name':
return FilesNames.FileName(obj_id)
elif obj_type == 'dom-hash':
return DomHashs.DomHash(obj_id)
elif obj_type == 'hhhash':
return HHHashs.HHHash(obj_id)
elif obj_type == 'image':
Expand Down
24 changes: 24 additions & 0 deletions update/v5.9/Update.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*

import os
import sys

sys.path.append(os.environ['AIL_HOME'])
##################################
# Import Project packages
##################################
from update.bin.ail_updater import AIL_Updater
from lib import ail_updates

class Updater(AIL_Updater):
    """Updater for this release: adds nothing on top of AIL_Updater."""

    def __init__(self, version):
        # Plain pass-through of the version to the base class.
        super().__init__(version)


if __name__ == '__main__':
    # Run the update for this version, then register its background update.
    version = 'v5.9'
    Updater(version).run_update()
    ail_updates.add_background_update(version)
31 changes: 31 additions & 0 deletions update/v5.9/Update.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#!/bin/bash
# AIL v5.9 update script: stops AIL, updates the git submodules and runs the
# Python updater for this version.

# Abort early when a required environment variable is missing.
[ -z "$AIL_HOME" ] && echo "Needs the env var AIL_HOME. Run the script from the virtual environment." && exit 1;
[ -z "$AIL_REDIS" ] && echo "Needs the env var AIL_REDIS. Run the script from the virtual environment." && exit 1;
[ -z "$AIL_BIN" ] && echo "Needs the env var AIL_BIN. Run the script from the virtual environment." && exit 1;
[ -z "$AIL_FLASK" ] && echo "Needs the env var AIL_FLASK. Run the script from the virtual environment." && exit 1;

export PATH=$AIL_HOME:$PATH
export PATH=$AIL_REDIS:$PATH
export PATH=$AIL_BIN:$PATH
export PATH=$AIL_FLASK:$PATH

# ANSI color escapes, interpreted by `echo -e`.
GREEN="\\033[1;32m"
DEFAULT="\\033[0;39m"

echo -e "${GREEN}Shutting down AIL ...${DEFAULT}"
bash "${AIL_BIN}/LAUNCH.sh" -ks
wait

# SUBMODULES #
git submodule update

echo ""
echo -e "${GREEN}Updating AIL VERSION ...${DEFAULT}"
echo ""
python "${AIL_HOME}/update/v5.9/Update.py"
wait
echo ""
echo ""

exit 0
39 changes: 39 additions & 0 deletions update/v5.9/reprocess_dom_hash.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*

import gzip
import os
import sys

sys.path.append(os.environ['AIL_BIN'])
##################################
# Import Project packages
##################################
from lib import ail_updates
from lib.objects import ail_objects
from lib.objects import DomHashs
from lib.objects.Domains import Domain

if __name__ == '__main__':
    # Background update v5.9: compute the DOM hash of every crawled item and
    # correlate it with the item and its domain.
    update = ail_updates.AILBackgroundUpdate('v5.9')
    nb_items = ail_objects.card_obj_iterator('item', filters={'sources': ['crawled']})
    update.set_nb_to_update(nb_items)

    crawled_items = ail_objects.obj_iterator('item', filters={'sources': ['crawled']})
    for processed, item in enumerate(crawled_items, start=1):
        domain = Domain(item.get_domain())
        content = item.get_content()
        if domain.exists() and content:
            item_date = item.get_date()
            # DOM-HASH
            dom_hash = DomHashs.create(content)
            dom_hash.add(item_date, item)
            dom_hash.add_correlation('domain', '', domain.id)

            print(domain.id, item.id, dom_hash.id)

        # Every item counts towards the progress, whether or not it was hashed.
        update.inc_nb_updated()
        if processed % 100 == 0:
            update.update_progress()
2 changes: 2 additions & 0 deletions var/www/Flask_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@
from blueprints.objects_cookie_name import objects_cookie_name
from blueprints.objects_etag import objects_etag
from blueprints.objects_hhhash import objects_hhhash
from blueprints.objects_dom_hash import objects_dom_hash
from blueprints.chats_explorer import chats_explorer
from blueprints.objects_image import objects_image
from blueprints.objects_ocr import objects_ocr
Expand Down Expand Up @@ -138,6 +139,7 @@ def filter(self, record):
app.register_blueprint(objects_cookie_name, url_prefix=baseUrl)
app.register_blueprint(objects_etag, url_prefix=baseUrl)
app.register_blueprint(objects_hhhash, url_prefix=baseUrl)
app.register_blueprint(objects_dom_hash, url_prefix=baseUrl)
app.register_blueprint(chats_explorer, url_prefix=baseUrl)
app.register_blueprint(objects_image, url_prefix=baseUrl)
app.register_blueprint(objects_ocr, url_prefix=baseUrl)
Expand Down
Loading

0 comments on commit b988f46

Please sign in to comment.