From 44834c29e75b1051fd84592ea8aa3137adb14264 Mon Sep 17 00:00:00 2001 From: Nectariferous Date: Wed, 10 Jul 2024 20:16:07 +0600 Subject: [PATCH] Update download-trade.py --- python/download-trade.py | 203 ++++++++++++++++++++++----------------- 1 file changed, 113 insertions(+), 90 deletions(-) diff --git a/python/download-trade.py b/python/download-trade.py index af75964..a27721f 100755 --- a/python/download-trade.py +++ b/python/download-trade.py @@ -1,114 +1,137 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 """ - script to download trades. - set the absoluate path destination folder for STORE_DIRECTORY, and run - - e.g. STORE_DIRECTORY=/data/ ./download-trade.py - +Enhanced script to download trades. +Set the absolute path destination folder for STORE_DIRECTORY, and run + +e.g. STORE_DIRECTORY=/data/ ./download-trade.py + +New features: +- Multithreading for faster downloads +- Progress bar for better visibility +- Improved error handling and logging +- Rate limiting to avoid overwhelming the server +- Automatic retries for failed downloads +- CSV export option for downloaded data summary """ import sys -from datetime import * +import os +from datetime import datetime, timedelta import pandas as pd +from concurrent.futures import ThreadPoolExecutor, as_completed +import requests +from requests.exceptions import RequestException +from tqdm import tqdm +import time +import logging +import csv from enums import * -from utility import download_file, get_all_symbols, get_parser, get_start_end_date_objects, convert_to_date_object, \ - get_path - +from utility import download_file, get_all_symbols, get_parser, get_start_end_date_objects, convert_to_date_object, get_path + +# Configure logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + +# Constants +MAX_RETRIES = 3 +RATE_LIMIT = 0.5 # seconds between requests + +def download_with_retry(path, file_name, date_range, folder, max_retries=MAX_RETRIES): + for attempt in range(max_retries): + try: + download_file(path, file_name, date_range, folder) + time.sleep(RATE_LIMIT) # Rate limiting + return True + except RequestException as e: + logger.warning(f"Attempt {attempt + 1} failed: {e}") + if attempt == max_retries - 1: + logger.error(f"Failed to download {file_name} after {max_retries} attempts") + return False + time.sleep(2 ** attempt) # Exponential backoff def download_monthly_trades(trading_type, symbols, num_symbols, years, months, start_date, end_date, folder, checksum): - current = 0 - date_range = None - - if start_date and end_date: - date_range = start_date + " " + end_date - - if not start_date: - start_date = START_DATE - else: - start_date = convert_to_date_object(start_date) - - if not end_date: - end_date = END_DATE - else: - end_date = convert_to_date_object(end_date) - - print("Found {} symbols".format(num_symbols)) - - for symbol in symbols: - print("[{}/{}] - start download monthly {} trades ".format(current+1, num_symbols, symbol)) - for year in years: - for month in months: - current_date = convert_to_date_object('{}-{}-01'.format(year, month)) - if current_date >= start_date and current_date <= end_date: - path = get_path(trading_type, "trades", "monthly", symbol) - file_name = "{}-trades-{}-{}.zip".format(symbol.upper(), year, '{:02d}'.format(month)) - download_file(path, file_name, date_range, folder) - - if checksum == 1: - checksum_path = get_path(trading_type, "trades", "monthly", symbol) - checksum_file_name = "{}-trades-{}-{}.zip.CHECKSUM".format(symbol.upper(), year, '{:02d}'.format(month)) - download_file(checksum_path, checksum_file_name, date_range, folder) + tasks = [] - current += 1 + with ThreadPoolExecutor(max_workers=10) as executor: + for symbol in symbols: + for year in years: + for month in months: + current_date = convert_to_date_object(f'{year}-{month}-01') + if start_date <= current_date <= end_date: + path = get_path(trading_type, "trades", "monthly", symbol) + file_name = f"{symbol.upper()}-trades-{year}-{month:02d}.zip" + tasks.append(executor.submit(download_with_retry, path, file_name, f"{start_date} {end_date}", folder)) + + if checksum: + checksum_path = get_path(trading_type, "trades", "monthly", symbol) + checksum_file_name = f"{file_name}.CHECKSUM" + tasks.append(executor.submit(download_with_retry, checksum_path, checksum_file_name, f"{start_date} {end_date}", folder)) + + return [task.result() for task in tqdm(as_completed(tasks), total=len(tasks), desc="Downloading monthly trades")] def download_daily_trades(trading_type, symbols, num_symbols, dates, start_date, end_date, folder, checksum): - current = 0 - date_range = None - - if start_date and end_date: - date_range = start_date + " " + end_date - - if not start_date: - start_date = START_DATE - else: - start_date = convert_to_date_object(start_date) - - if not end_date: - end_date = END_DATE - else: - end_date = convert_to_date_object(end_date) + tasks = [] - print("Found {} symbols".format(num_symbols)) - - for symbol in symbols: - print("[{}/{}] - start download daily {} trades ".format(current+1, num_symbols, symbol)) - for date in dates: - current_date = convert_to_date_object(date) - if current_date >= start_date and current_date <= end_date: - path = get_path(trading_type, "trades", "daily", symbol) - file_name = "{}-trades-{}.zip".format(symbol.upper(), date) - download_file(path, file_name, date_range, folder) - - if checksum == 1: - checksum_path = get_path(trading_type, "trades", "daily", symbol) - checksum_file_name = "{}-trades-{}.zip.CHECKSUM".format(symbol.upper(), date) - download_file(checksum_path, checksum_file_name, date_range, folder) - - current += 1 + with ThreadPoolExecutor(max_workers=10) as executor: + for symbol in symbols: + for date in dates: + current_date = convert_to_date_object(date) + if start_date <= current_date <= end_date: + path = get_path(trading_type, "trades", "daily", symbol) + file_name = f"{symbol.upper()}-trades-{date}.zip" + tasks.append(executor.submit(download_with_retry, path, file_name, f"{start_date} {end_date}", folder)) + + if checksum: + checksum_path = get_path(trading_type, "trades", "daily", symbol) + checksum_file_name = f"{file_name}.CHECKSUM" + tasks.append(executor.submit(download_with_retry, checksum_path, checksum_file_name, f"{start_date} {end_date}", folder)) + + return [task.result() for task in tqdm(as_completed(tasks), total=len(tasks), desc="Downloading daily trades")] + +def export_summary(symbols, start_date, end_date, monthly_results, daily_results): + summary_file = "download_summary.csv" + with open(summary_file, 'w', newline='') as csvfile: + writer = csv.writer(csvfile) + writer.writerow(["Symbol", "Start Date", "End Date", "Monthly Downloads", "Daily Downloads", "Total Downloads"]) + + for symbol in symbols: + monthly_count = sum(1 for result in monthly_results if result) + daily_count = sum(1 for result in daily_results if result) + total_count = monthly_count + daily_count + writer.writerow([symbol, start_date, end_date, monthly_count, daily_count, total_count]) + + logger.info(f"Download summary exported to {summary_file}") if __name__ == "__main__": parser = get_parser('trades') args = parser.parse_args(sys.argv[1:]) if not args.symbols: - print("fetching all symbols from exchange") - symbols = get_all_symbols(args.type) - num_symbols = len(symbols) + logger.info("Fetching all symbols from exchange") + symbols = get_all_symbols(args.type) + num_symbols = len(symbols) else: - symbols = args.symbols - num_symbols = len(symbols) - print("fetching {} symbols from exchange".format(num_symbols)) + symbols = args.symbols + num_symbols = len(symbols) + logger.info(f"Fetching {num_symbols} symbols from exchange") + + start_date, end_date = get_start_end_date_objects(args.startDate, args.endDate) if args.dates: - dates = args.dates + dates = args.dates else: - period = convert_to_date_object(datetime.today().strftime('%Y-%m-%d')) - convert_to_date_object( - PERIOD_START_DATE) - dates = pd.date_range(end=datetime.today(), periods=period.days + 1).to_pydatetime().tolist() - dates = [date.strftime("%Y-%m-%d") for date in dates] - if args.skip_monthly == 0: - download_monthly_trades(args.type, symbols, num_symbols, args.years, args.months, args.startDate, args.endDate, args.folder, args.checksum) + dates = pd.date_range(start=start_date, end=end_date).strftime("%Y-%m-%d").tolist() + + monthly_results = [] + daily_results = [] + + if args.skip_monthly == 0: + monthly_results = download_monthly_trades(args.type, symbols, num_symbols, args.years, args.months, start_date, end_date, args.folder, args.checksum) + if args.skip_daily == 0: - download_daily_trades(args.type, symbols, num_symbols, dates, args.startDate, args.endDate, args.folder, args.checksum) - + daily_results = download_daily_trades(args.type, symbols, num_symbols, dates, start_date, end_date, args.folder, args.checksum) + + export_summary(symbols, start_date.strftime("%Y-%m-%d"), end_date.strftime("%Y-%m-%d"), monthly_results, daily_results) + + logger.info("Download completed successfully!")