meetings_calendar crawler added #22

Closed
wants to merge 5 commits into from
README.md (7 additions, 0 deletions)

@@ -56,6 +56,13 @@ Get [National Assembly meetings](http://likms.assembly.go.kr/record/).
    cd meetings
    python crawl.py

#### meetings_calendar
Get [National Assembly meetings calendar](http://www.assembly.go.kr/renew10/anc/schedule/assm/assemact/council/council0101/assmSchCal/assemSchCal.do).

    cd meetings_calendar
    python get.py 2014-11-01 2014-11-11  # get the meeting schedule from 2014-11-01 to 2014-11-11
    python get.py 2014-11-01             # get the meeting schedule for 2014-11-01 only
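Results are written to `sources/meetings_<start>_<end>.csv` (dates without dashes). The first line is the CSV header the script emits (taken from its `header` constant):

    "date","time","type","title","session","sitting","committee","url"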

#### national_assembly
Get member information from the [Korean National Assembly](http://www.assembly.go.kr/).

meetings_calendar/get.py (111 additions, 0 deletions)

@@ -0,0 +1,111 @@
#! /usr/bin/python2.7
# -*- coding: utf-8 -*-

import os
import io
import urllib2
import html5lib
import datetime
import re
import sys

base_url = 'http://www.assembly.go.kr/renew10/anc/schedule/assm/assemact/council/council0101/assmSchCal/assemSchCalInfoAjax.do?currentPage=&movePageNum=&rowPerPage=1000&gubun=&agendaid=&committee_id=&board_id=&record_id=&returnPage=&weekday=&today=&calendarMove=&showDt=&meetingday=%s'

link_url = 'http://www.assembly.go.kr/renew10/anc/schedule/assm/assemact/council/council0101/assmSchCal/assemScanCalDetail.do?gubun=%s&agendaid=%s&committee_id=%s&board_id=%s&record_id=%s'

sources_dir = './sources'

header = '"date","time","type","title","session","sitting","committee","url"\n'

xpath_title = '//a[contains(@onclick, "jsDetail")]/text()'
xpath_link_params = '//a[contains(@onclick, "jsDetail")]/@onclick'
xpath_datetime = '//dd/text()'
xpath_committee = '//dd/span/text()'
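# The Ajax page is assumed to mark up each meeting roughly like this
# (illustrative markup, inferred from the xpaths above, not taken from the live site):
#   <a onclick="jsDetail('gubun','agendaid','committee_id','board_id','record_id');return false;">[회의구분] 제000회 ... 제00차 ...</a>
#   <dd>0000-00-00 00:00 <span>위원회</span></dd>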

def is_dashed(s):
    return '-' in s

def crawl(url, directory, filename):
    if not os.path.exists(directory):
        os.makedirs(directory)

    r = urllib2.urlopen(url)
    with open('%s/%s.html' % (directory, filename), 'w') as f:
        f.write(r.read())

def get_webpage(f):
    parser = html5lib.HTMLParser(
        tree=html5lib.treebuilders.getTreeBuilder("lxml"),
        namespaceHTMLElements=False)
    return parser.parse(f, encoding='utf-8')

def get_link_url(gubun, agendaid, committee_id, board_id, record_id):
    return link_url % (gubun, agendaid, committee_id, board_id, record_id)

def parse_meeting_schedule(filename):
    # '0000-00-00' plus the space that separates the date from the time
    date_length = len('0000-00-00') + 1

    session_re = re.compile(u'제(?P<session>[0-9]+)회')
    sitting_re = re.compile(u'제(?P<sitting>[0-9]+)차')

    with open(filename, 'r') as f:
        p = get_webpage(f)

    raw_titles = p.xpath(xpath_title)
    link_params = p.xpath(xpath_link_params)
    datetimes = p.xpath(xpath_datetime)
    committees = p.xpath(xpath_committee)

    datetimes = [dt for dt in datetimes if dt.strip() != '']
    # strip the jsDetail(...) wrapper, leaving only the argument list
    link_params = [lp.replace('jsDetail(', '').replace(');return false;', '')
                   for lp in link_params]

    dates = [dt[:date_length].strip() for dt in datetimes]
    times = [dt[date_length:].strip() for dt in datetimes]
    types = [t[t.find('[')+1:t.find(']')] for t in raw_titles]  # text inside [...]
    titles = [t[t.find(']')+2:] for t in raw_titles]
    sessions = [session_re.findall(t)[0] for t in titles]
    sittings = [sitting_re.findall(t)[0] for t in titles]
    # split the quoted argument list ourselves instead of eval()ing page content
    links = [get_link_url(*[arg.strip().strip("'\"") for arg in lp.split(',')])
             for lp in link_params]

    return zip(dates, times, types, titles, sessions, sittings, committees, links)
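# Each parsed row is a tuple in CSV column order, e.g. (illustrative values):
#   ('2014-11-01', '14:00', u'본회의', u'제329회 국회(정기회) 제12차',
#    '329', '12', u'본회의', 'http://www.assembly.go.kr/...?gubun=...&record_id=...')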

def get_meeting_list(start, end):
    if is_dashed(start):
        start = start.replace('-', '')

    if is_dashed(end):
        end = end.replace('-', '')

    startDt = datetime.datetime.strptime(start, '%Y%m%d').date()
    endDt = datetime.datetime.strptime(end, '%Y%m%d').date()

    td = datetime.timedelta(days=1)

    csv_filename = 'meetings_%s_%s.csv' % (start, end)

    # the CSV is opened before crawl() ever runs, so the output
    # directory has to exist up front
    if not os.path.exists(sources_dir):
        os.makedirs(sources_dir)

    with open('%s/%s' % (sources_dir, csv_filename), 'w') as f:
        f.write(header.encode('utf-8'))
        while startDt <= endDt:
            filename = str(startDt).replace('-', '')
            crawl(base_url % filename, sources_dir, filename)
            result = parse_meeting_schedule('%s/%s.html' % (sources_dir, filename))
            f.write('\n'.join(
                ['"%s"' % '","'.join(row) for row in result]
            ).encode('utf-8'))
            f.write('\n')
            startDt = startDt + td

    print 'parsed to %s' % csv_filename

if __name__ == '__main__':
    if len(sys.argv) == 1:
        print 'usage: python get.py YYYY-MM-DD YYYY-MM-DD'
        print '       python get.py YYYY-MM-DD'
    elif len(sys.argv) == 2:
        get_meeting_list(sys.argv[1], sys.argv[1])
    elif len(sys.argv) == 3:
        get_meeting_list(sys.argv[1], sys.argv[2])
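# Example: `python get.py 2014-11-01 2014-11-11` saves one HTML snapshot per day
# under ./sources/ and writes the combined schedule to
# ./sources/meetings_20141101_20141111.csv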