Commit

dpetzold committed May 18, 2024
1 parent e7346ef commit 40698fd
Showing 3 changed files with 63 additions and 17 deletions.
16 changes: 11 additions & 5 deletions aws_log_parser/aws/s3.py
@@ -27,18 +27,24 @@ def list_files(self, bucket, prefix, sort_key, reverse=True):
         return sorted(items, key=lambda x: x[sort_key], reverse=reverse)
 
     def read_key(self, bucket, key, endswith=None):
-        if self.aws_client.verbose:
-            print(f"Reading s3://{bucket}/{key}")
+        print(f"Reading s3://{bucket}/{key}")
         contents = self.client.get_object(Bucket=bucket, Key=key)
         if endswith == ".gz":
             with gzip.GzipFile(fileobj=contents["Body"]) as _gz:
                 yield from [line for line in _gz.read().decode("utf-8").splitlines()]
         else:
             yield from [line.decode("utf-8") for line in contents["Body"].iter_lines()]
 
-    def read_keys(self, bucket, prefix, endswith=None):
+    def read_keys(self, bucket, prefix, filter=None, endswith=None):
+        print(f"Reading s3://{bucket}/{prefix}")
         for file in self.list_files(bucket, prefix, "LastModified"):
-            if endswith and not file["Key"].endswith(endswith):
+            key = file["Key"]
+            if endswith and not key.endswith(endswith):
+                print(f"skipping {key} suffix")
                 continue
 
-            yield from self.read_key(bucket, file["Key"], endswith)
+            if filter and filter not in key:
+                print(f"skipping {key} filter")
+                continue
+
+            yield from self.read_key(bucket, key, endswith)
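For context, the two new guards in read_keys() act as a suffix filter plus a substring filter over the listed object keys. The sketch below restates that logic in isolation; the helper name and the sample keys are made up for illustration and are not part of the library.

def keys_to_read(keys, filter=None, endswith=None):
    # Mirror of the skip logic above: keep a key only if it carries the
    # required suffix and contains the filter substring.
    for key in keys:
        if endswith and not key.endswith(endswith):
            continue  # would be reported as "skipping {key} suffix"
        if filter and filter not in key:
            continue  # would be reported as "skipping {key} filter"
        yield key


sample_keys = [
    "AWSLogs/2024/05/18/app-a.log.gz",
    "AWSLogs/2024/05/18/app-b.log.gz",
    "AWSLogs/2024/05/18/app-a.log",
]
print(list(keys_to_read(sample_keys, filter="app-a", endswith=".gz")))
# ['AWSLogs/2024/05/18/app-a.log.gz']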
12 changes: 9 additions & 3 deletions aws_log_parser/interface.py
@@ -120,7 +120,7 @@ def read_files(self, pathname):
         for p in path.glob(f"**/*{self.file_suffix}"):
             yield from self.read_file(p)
 
-    def read_s3(self, bucket, prefix, endswith=None):
+    def read_s3(self, bucket, prefix, filter=None, endswith=None):
         """
         Yield parsed log entries from the given s3 url.
         Low level function used by ``parse_url``.
@@ -133,10 +133,15 @@ def read_s3(self, bucket, prefix, endswith=None):
         :rtype: Dependant on log_type.
         """
         yield from self.parse(
-            self.aws_client.s3_service.read_keys(bucket, prefix, endswith=endswith)
+            self.aws_client.s3_service.read_keys(
+                bucket,
+                prefix,
+                filter=filter,
+                endswith=endswith,
+            )
         )
 
-    def read_url(self, url):
+    def read_url(self, url, filter=None):
         """
         Yield parsed log entries from the given url. The file:// and s3://
         schemes are currently supported.
@@ -165,6 +170,7 @@ def read_url(self, url):
             yield from self.read_s3(
                 parsed.netloc,
                 parsed.path.lstrip("/"),
+                filter=filter,
                 endswith=self.file_suffix,
             )
         else:
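With filter threaded through read_url() and read_s3(), callers can narrow which S3 keys are downloaded before parsing. A rough usage sketch against the public AwsLogParser interface used in examples/count-hosts.py; the bucket, prefix, profile, region, and filter string below are placeholders.

from aws_log_parser import AwsLogParser, LogType

entries = AwsLogParser(
    log_type=LogType.CloudFront,
    file_suffix=".gz",
    profile="my-profile",  # placeholder AWS profile
    region="us-east-1",    # placeholder region
).read_url(
    "s3://my-log-bucket/cloudfront/",  # placeholder bucket and prefix
    filter="E2EXAMPLE",  # only keys containing this substring are read
)

for entry in entries:
    print(entry.client_ip)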
52 changes: 43 additions & 9 deletions examples/count-hosts.py
@@ -1,18 +1,29 @@
 #!/bin/env python
 
 import argparse
+from rich.console import Console
+from rich.table import Table
 
 from collections import Counter
 from operator import attrgetter
 
 from aws_log_parser import AwsLogParser, LogType
 
+console = Console()
+
 
-def count_ips(entries, ip_attr):
-    counter = Counter(attrgetter(ip_attr)(entry) for entry in entries)
-
-    for ip, count in sorted(counter.items()):
-        print(f"{ip}: {count}")
+def count_ips(entries, attr_name, num_results):
+    table = Table()
+
+    counter = Counter(attrgetter(attr_name)(entry) for entry in entries)
+
+    table.add_column(attr_name)
+    table.add_column("count")
+
+    for attr, count in sorted(counter.most_common(num_results)):
+        table.add_row(attr, str(count))
+
+    console.print(table)
 
 
 def main():
@@ -32,28 +43,51 @@ def main():
         type=lambda x: getattr(LogType, x),
         help="The the log type.",
     )
 
+    parser.add_argument(
+        "--count",
+        default=20,
+        type=int,
+        help="Show this number of results.",
+    )
+    parser.add_argument(
+        "--filter",
+        help="Filter filenames that match this string.",
+    )
+    parser.add_argument(
+        "--suffix",
+        help="Filter filenames the specified suffix.",
+    )
     parser.add_argument(
         "--profile",
         help="The aws profile to use.",
     )
 
     parser.add_argument(
        "--region",
        help="The aws region to use.",
     )
+    parser.add_argument(
+        "--attr",
+        help="The attribute to count.",
+    )
 
     args = parser.parse_args()
 
-    ip_attr = "client_ip" if args.log_type == LogType.CloudFront else "client.ip"
+    count_attr = (
+        args.attr
+        if args.attr
+        else "client_ip"
+        if args.log_type == LogType.CloudFront
+        else "client.ip"
+    )
 
     entries = AwsLogParser(
         log_type=args.log_type,
         file_suffix=args.suffix,
         profile=args.profile,
         region=args.region,
-    ).read_url(args.url)
+    ).read_url(args.url, filter=args.filter)
 
-    count_ips(entries, ip_attr)
+    count_ips(entries, count_attr, args.count)
 
 
 main()
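The reworked count_ips() renders its counts as a rich table instead of bare print() calls. A minimal self-contained sketch of that pattern, using made-up sample data:

from collections import Counter

from rich.console import Console
from rich.table import Table

console = Console()
counter = Counter(["10.0.0.1", "10.0.0.2", "10.0.0.1"])

table = Table()
table.add_column("client_ip")
table.add_column("count")

# most_common(n) keeps the n highest counts; sorted() then orders the rows by value.
for value, count in sorted(counter.most_common(20)):
    table.add_row(value, str(count))

console.print(table)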
