Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add results archiving #10

Merged
merged 3 commits into from
Sep 4, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -138,3 +138,4 @@ dmypy.json

# Project
logs/
archives/
5 changes: 4 additions & 1 deletion check.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@
import signal
from time import sleep, time

from utils import common, analyser, db
from utils import analyser, common, db
from utils import logger as main_logger
from utils.common import archive_results
from utils.logger import logger

db_name: str = ''
Expand Down Expand Up @@ -62,6 +63,8 @@ def main() -> None:
main()
logger.info('Analysis pipeline done')

archive_results()

end_time: float = time()
execution_time: float = end_time - start_time

Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ pycodestyle==2.11.0
pycparser==2.21
pyOpenSSL==23.2.0
requests==2.31.0
soupsieve==2.4.1
soupsieve==2.5
tomli==2.0.1
types-pyOpenSSL==23.2.0.2
types-requests==2.31.0.2
Expand Down
17 changes: 5 additions & 12 deletions utils/analyser.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
import ssl
from os import path, mkdir, remove, listdir
from shutil import rmtree
from os import mkdir, path
from time import sleep
from urllib.parse import quote, urlparse

import requests
from OpenSSL import crypto

from utils import threading
from utils.common import clean_logs_directory, clean_results_directory
from utils.const import HEADER, SELF_SIGNED_CERTS, UNTRUSTED_CERTS
from utils.logger import logger
from utils.const import UNTRUSTED_CERTS, SELF_SIGNED_CERTS, HEADER


def __get_root_cert(link: str):
Expand Down Expand Up @@ -100,15 +100,8 @@ def run_pipeline(link_batches: tuple, timeout: int) -> None:
last_idx: int = len(link_batches)
idx: int = 0

if path.exists('results'):
for content in listdir('results/'):
content_path = path.join('results/', content)
if path.isfile(content_path):
remove(content_path)
else:
rmtree(content_path, ignore_errors=True)
else:
mkdir('results')
clean_results_directory()
clean_logs_directory()

mkdir('results/government')
mkdir('results/social')
Expand Down
71 changes: 71 additions & 0 deletions utils/common.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
import datetime
import os
import zipfile
from os import listdir, mkdir, path, remove
from shutil import rmtree


def read_links(filename: str) -> list[list[str]]:
MAX_BATCH_SIZE: int = 8000
with open(filename) as f:
Expand Down Expand Up @@ -31,3 +38,67 @@ def get_lines_count_in(file: str) -> int:
exit(1)

return count


def archive_results(save_logs=False) -> None:
"""Save results of the analysis to <date-time>.zip archive.

Keyword arguments:
save_logs -- add logs folder to archive (optional, default False)
"""
# Get the current date and time
current_datetime = datetime.datetime.now()
date_time_str = current_datetime.strftime("%Y-%m-%d-%H-%M-%S")

# Define the archive filename
archive_filename = f"{date_time_str}.zip"

# Check if the 'results' folder exists
if not os.path.exists('results'):
print("Error: 'results' folder not found.")
return

# Create a new ZIP archive
with zipfile.ZipFile(archive_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
# Add files from the 'results' folder to the archive
for root, _, files in os.walk('results'):
for file in files:
file_path = os.path.join(root, file)
zipf.write(file_path, os.path.relpath(file_path, 'results'))

# Add files from the 'logs' folder to the archive if save_logs is True
if save_logs and os.path.exists('logs'):
for root, _, files in os.walk('logs'):
for file in files:
file_path = os.path.join(root, file)
zipf.write(file_path, os.path.relpath(file_path, 'logs'))

# Move the created archive to the 'results' folder
if not path.exists('archives'):
mkdir('archives')
os.rename(archive_filename, os.path.join('archives', archive_filename))

print(
f"Results successfully archived to {os.path.join('results', archive_filename)}")


def clean_logs_directory():
    """Empty the 'logs' folder, deleting its files and subdirectories.

    Does nothing when the folder does not exist.
    """
    if not path.exists('logs'):
        return
    for entry in listdir('logs'):
        entry_path = path.join('logs', entry)
        if path.isfile(entry_path):
            remove(entry_path)
        else:
            rmtree(entry_path, ignore_errors=True)


def clean_results_directory():
    """Ensure 'results' exists and is empty.

    Creates the folder when it is missing; otherwise removes every file
    and subdirectory inside it.
    """
    if not path.exists('results'):
        mkdir('results')
        return
    for entry in listdir('results/'):
        entry_path = path.join('results/', entry)
        if path.isfile(entry_path):
            remove(entry_path)
        else:
            rmtree(entry_path, ignore_errors=True)