Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

show stats for each job #42

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion SpiderKeeper/app/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
import logging
import traceback

import apscheduler
from apscheduler.schedulers.background import BackgroundScheduler
from flask import Flask
from flask import jsonify
Expand Down
14 changes: 12 additions & 2 deletions SpiderKeeper/app/proxy/spiderctrl.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import datetime
import random
from functools import reduce
import requests
import re

from SpiderKeeper.app import db
from SpiderKeeper.app.spider.model import SpiderStatus, JobExecution, JobInstance, Project, JobPriority
from SpiderKeeper.app.spider.model import SpiderStatus, JobExecution, JobInstance, Project, \
JobPriority


class SpiderServiceProxy(object):
Expand Down Expand Up @@ -115,6 +117,14 @@ def sync_job_status(self, project):
job_execution.start_time = job_execution_info['start_time']
job_execution.end_time = job_execution_info['end_time']
job_execution.running_status = SpiderStatus.FINISHED

res = requests.get(self.log_url(job_execution))
res.encoding = 'utf8'
raw = res.text[-4096:]
match = re.findall(job_execution.RAW_STATS_REGEX, raw, re.DOTALL)
if match:
job_execution.raw_stats = match[0]
job_execution.process_raw_stats()
# commit
db.session.commit()

Expand Down
3 changes: 1 addition & 2 deletions SpiderKeeper/app/schedulers/common.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import threading
import time

from SpiderKeeper.app import scheduler, app, agent, db
Expand Down Expand Up @@ -29,7 +28,7 @@ def sync_spiders():
def run_spider_job(job_instance_id):
'''
run spider by scheduler
:param job_instance:
:param job_instance_id:
:return:
'''
try:
Expand Down
32 changes: 31 additions & 1 deletion SpiderKeeper/app/spider/model.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import datetime
import demjson
import re
from sqlalchemy import desc
from SpiderKeeper.app import db, Base

Expand Down Expand Up @@ -159,6 +161,29 @@ class JobExecution(Base):
running_status = db.Column(db.INTEGER, default=SpiderStatus.PENDING)
running_on = db.Column(db.Text)

raw_stats = db.Column(db.Text)
items_count = db.Column(db.Integer)
warnings_count = db.Column(db.Integer)
errors_count = db.Column(db.Integer)

# Pattern locating the stats dump in a Scrapy log: the literal tag
# "[scrapy.statscollectors]", any text up to the first "{", then a captured
# brace-delimited group that ends at the first "}".
# Written as a raw string so the backslashes reach the regex engine without
# triggering Python's invalid-escape-sequence warning.
RAW_STATS_REGEX = r'\[scrapy\.statscollectors\][^{]+({[^}]+})'

def process_raw_stats(self):
    """Fill the item/warning/error counters from ``self.raw_stats``.

    Does nothing when ``raw_stats`` is None. Otherwise every unquoted
    ``datetime.datetime(...)`` expression in the text is wrapped in single
    quotes (the rewritten text is stored back on ``self.raw_stats``), the
    result is decoded with ``demjson`` and the counters are read from the
    keys ``item_scraped_count``, ``log_count/WARNING`` and
    ``log_count/ERROR``; a missing or falsy value is stored as 0.

    If the text cannot be decoded the counters are left untouched instead
    of raising, so a malformed stats blob does not abort the caller.
    """
    if self.raw_stats is None:
        return

    # The lookbehind skips datetime expressions that are already quoted, so
    # calling this method again on the same row does not double-quote them.
    datetime_regex = r"(?<!')(datetime\.datetime\([^)]+\))"
    self.raw_stats = re.sub(datetime_regex, r"'\1'", self.raw_stats)

    try:
        stats = demjson.decode(self.raw_stats)
    except demjson.JSONDecodeError:
        # Unparsable stats: keep whatever counter values were there before.
        return

    self.items_count = stats.get('item_scraped_count') or 0
    self.warnings_count = stats.get('log_count/WARNING') or 0
    self.errors_count = stats.get('log_count/ERROR') or 0

def has_warnings(self):
    """Return True when this execution should be flagged as a warning.

    That is the case when no raw stats are stored, when the item count is
    missing or zero, or when the warning count is non-zero. The result is
    always a bool, matching ``has_errors``.
    """
    return bool(
        not self.raw_stats or not self.items_count or self.warnings_count
    )

def has_errors(self):
    """Return True when the stored error count is non-zero, else False."""
    if self.errors_count:
        return True
    return False

def to_dict(self):
job_instance = JobInstance.query.filter_by(id=self.job_instance_id).first()
return {
Expand All @@ -171,7 +196,12 @@ def to_dict(self):
'end_time': self.end_time.strftime('%Y-%m-%d %H:%M:%S') if self.end_time else None,
'running_status': self.running_status,
'running_on': self.running_on,
'job_instance': job_instance.to_dict() if job_instance else {}
'job_instance': job_instance.to_dict() if job_instance else {},
'has_warnings': self.has_warnings(),
'has_errors': self.has_errors(),
'items_count': self.items_count if self.items_count is not None else '-',
'warnings_count': self.warnings_count if self.warnings_count is not None else '-',
'errors_count': self.errors_count if self.errors_count is not None else '-'
}

@classmethod
Expand Down
1 change: 0 additions & 1 deletion SpiderKeeper/app/static/css/app.css
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@

.txt-args {
font-size: 10px;
display: block;
white-space: nowrap;
overflow: hidden;
text-overflow: ellipsis;
Expand Down
22 changes: 14 additions & 8 deletions SpiderKeeper/app/templates/job_dashboard.html
Original file line number Diff line number Diff line change
Expand Up @@ -153,12 +153,15 @@ <h3 class="box-title">Completed Jobs</h3>
<th style="width: 20px">Priority</th>
<th style="width: 40px">Runtime</th>
<th style="width: 120px">Started</th>
<th style="width: 10px">Items</th>
<th style="width: 10px">Warnings</th>
<th style="width: 10px">Errors</th>
<th style="width: 10px">Log</th>
<th style="width: 10px">Status</th>
</tr>
{% for job in job_status.COMPLETED %}
{% if job.job_instance %}
<tr>
<tr class="{% if job.has_errors %}danger{% elif job.has_warnings %}warning{% endif %}">
<td>{{ job.job_execution_id }}</td>
<td><a href="/project/1/job/periodic#{{ job.job_instance_id }}">{{ job.job_instance_id }}</a></td>
<td>{{ job.job_instance.spider_name }}</td>
Expand All @@ -184,17 +187,20 @@ <h3 class="box-title">Completed Jobs</h3>
{% endif %}
<td>{{ timedelta(job.end_time,job.start_time) }}</td>
<td>{{ job.start_time }}</td>
<td>{{ job.items_count }}</td>
<td>{{ job.warnings_count }}</td>
<td>{{ job.errors_count }}</td>
<td><a href="/project/{{ project.id }}/jobexecs/{{ job.job_execution_id }}/log" target="_blank"
data-toggle="tooltip" data-placement="top" title="{{ job.service_job_execution_id }}">Log</a>
</td>
{% if job.running_status == 2 %}
<td>
<span class="label label-success">FINISHED</span>
</td>
{% else %}
<td>
<span class="label label-danger">CANCELED</span>
</td>
<td>
<span class="label label-success">FINISHED</span>
</td>
{% else %}
<td>
<span class="label label-danger">CANCELED</span>
</td>
{% endif %}
</tr>
{% endif %}
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
aniso8601==1.2.0
APScheduler==3.3.1
click==6.7
demjson==2.2.4
Flask==0.12.1
Flask-BasicAuth==0.2.0
Flask-RESTful==0.3.5
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
'aniso8601==1.2.0',
'APScheduler==3.3.1',
'click==6.7',
'demjson==2.2.4',
'Flask==0.12.1',
'Flask-BasicAuth==0.2.0',
'Flask-RESTful==0.3.5',
Expand Down