From 2ca2055abfc422365b00d6dd7c1f93cef7d42aba Mon Sep 17 00:00:00 2001
From: Youngkyoung Lee
Date: Wed, 12 Nov 2014 18:47:19 +0900
Subject: [PATCH 1/3] meeting schedule crawler (in progress)

---
 meetings_calendar/get.py | 113 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 113 insertions(+)
 create mode 100644 meetings_calendar/get.py

diff --git a/meetings_calendar/get.py b/meetings_calendar/get.py
new file mode 100644
index 0000000..b25ffee
--- /dev/null
+++ b/meetings_calendar/get.py
@@ -0,0 +1,113 @@
+#! /usr/bin/python2.7
+# -*- coding: utf-8 -*-
+
+import os
+import io
+import urllib2
+import html5lib
+import datetime
+import re
+
+base_url = 'http://www.assembly.go.kr/renew10/anc/schedule/assm/assemact/council/council0101/assmSchCal/assemSchCalInfoAjax.do?currentPage=&movePageNum=&rowPerPage=1000&gubun=&agendaid=&committee_id=&board_id=&record_id=&returnPage=&weekday=&today=&calendarMove=&showDt=&meetingday=%s'
+
+link_url = 'http://www.assembly.go.kr/renew10/anc/schedule/assm/assemact/council/council0101/assmSchCal/assemScanCalDetail.do?gubun=%s&agendaid=%s&committee_id=%s&board_id=%s&record_id=%s'
+
+sources_dir = './sources'
+
+header = '"date","time","type","session","sitting","committee","url"'
+
+xpath_title = '//a[contains(@onclick, "jsDetail")]/text()'
+xpath_link_params = '//a[contains(@onclick, "jsDetail")]/@onclick'
+xpath_datetime = '//dd/text()'
+xpath_committee = '//dd/span/text()'
+
+def is_dashed(s):
+    return '-' in s
+
+def crawl(url, directory, filename):
+    # fetch url and cache it as directory/filename.html
+    if not os.path.exists(directory):
+        os.makedirs(directory)
+
+    r = urllib2.urlopen(url)
+    with open('%s/%s.html' % (directory, filename), 'w') as f:
+        f.write(r.read())
+
+def get_webpage(f):
+    parser = html5lib.HTMLParser(
+        tree=html5lib.treebuilders.getTreeBuilder("lxml"),
+        namespaceHTMLElements=False)
+    return parser.parse(f, encoding='utf-8')
+
+def get_link_url(gubun, agendaid, committee_id, board_id, record_id):
+    return link_url % (gubun, agendaid, committee_id, board_id, record_id)
+
+def parse_meeting_schedule(filename):
+    date_length = len('0000-00-00') + 1  # 'YYYY-MM-DD' plus the separator after it
+
+    # 제N회 = Nth session, 제N차 = Nth sitting
+    session_re = re.compile(u'제(?P<session>[0-9]+)회')
+    sitting_re = re.compile(u'제(?P<sitting>[0-9]+)차')
+
+    with open(filename, 'r') as f:
+        p = get_webpage(f)
+
+    raw_titles = p.xpath(xpath_title)
+    link_params = p.xpath(xpath_link_params)
+    datetimes = p.xpath(xpath_datetime)
+    committees = p.xpath(xpath_committee)
+
+    datetimes = [dt for dt in datetimes if dt.strip() != '']
+
+    dates = [dt[:date_length] for dt in datetimes]
+    times = [dt[date_length:] for dt in datetimes]
+
+    # raw titles look like '[type] title'
+    types = [title[title.find('[')+1:title.find(']')] for title in raw_titles]
+    titles = [title[title.find(']')+2:] for title in raw_titles]
+
+    sessions = [title[title.find(u'제')+1:title.find(u'회')] for title in titles]
+
+    # sessions = [session_re.match(title.encode('utf-8')).group('session') for title in raw_titles]
+
+    # sessions = [session_re.match(title).group('session') for title in raw_titles]
+
+    # print get_link_url('CMMTT', '0','2005110000004', '2006011000140', '2014110049524')
+
+    for date, time, type, title, session, committee in zip(dates, times, types, titles, sessions, committees):
+        print ('"%s","%s","%s","%s","%s","%s"' % (
+            date.strip().encode('utf-8'),
+            time.strip().encode('utf-8'),
+            type.strip().encode('utf-8'),
+            title.strip().encode('utf-8'),
+            session.strip().encode('utf-8'),
+            committee.strip().encode('utf-8')))
+
+    return zip(dates, times, titles, committees)
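
Note on the session/sitting regexes: patch 1 still slices each title by the characters 제 and 회, but the two named-group regexes it defines (session_re, sitting_re) become the real extraction mechanism in patch 2 below. A minimal standalone sanity check of those regexes; the sample title is a made-up string in the calendar's format, not scraped data:

    # -*- coding: utf-8 -*-
    # Sanity check for the regexes defined in get.py.
    # The title below is illustrative only, not real calendar data.
    import re

    session_re = re.compile(u'제(?P<session>[0-9]+)회')
    sitting_re = re.compile(u'제(?P<sitting>[0-9]+)차')

    title = u'제329회 국회(정기회) 제12차 본회의'
    print session_re.search(title).group('session')   # 329
    print sitting_re.search(title).group('sitting')   # 12
    print session_re.findall(title)                   # [u'329'], the form patch 2 uses
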
+
+def get_meeting_details(code):
+    return
+
+def get_meeting_list(start, end):
+    if is_dashed(start):
+        start = start.replace('-', '')
+
+    if is_dashed(end):
+        end = end.replace('-', '')
+
+    startDt = datetime.datetime.strptime(start, '%Y%m%d').date()
+    endDt = datetime.datetime.strptime(end, '%Y%m%d').date()
+
+    td = datetime.timedelta(days=1)
+
+    print header
+
+    while startDt <= endDt:
+        filename = str(startDt).replace('-', '')
+        crawl(base_url % filename, sources_dir, filename)
+        parse_meeting_schedule('%s/%s.html' % (sources_dir, filename))
+        startDt = startDt + td
+
+if __name__ == '__main__':
+    get_meeting_list('2014-10-13', '2014-11-13')

From 8d431a98d8fb75a83abcc5a568743ba48bc98273 Mon Sep 17 00:00:00 2001
From: Youngkyoung Lee
Date: Thu, 13 Nov 2014 01:08:41 +0900
Subject: [PATCH 2/3] meetings_calendar crawler added.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Added the meeting schedule (의사일정) crawler.

---
 meetings_calendar/get.py | 58 +++++++++++++++++++---------------------
 1 file changed, 28 insertions(+), 30 deletions(-)

diff --git a/meetings_calendar/get.py b/meetings_calendar/get.py
index b25ffee..bb7a1d1 100644
--- a/meetings_calendar/get.py
+++ b/meetings_calendar/get.py
@@ -7,6 +7,7 @@
 import html5lib
 import datetime
 import re
+import sys
 
 base_url = 'http://www.assembly.go.kr/renew10/anc/schedule/assm/assemact/council/council0101/assmSchCal/assemSchCalInfoAjax.do?currentPage=&movePageNum=&rowPerPage=1000&gubun=&agendaid=&committee_id=&board_id=&record_id=&returnPage=&weekday=&today=&calendarMove=&showDt=&meetingday=%s'
 
@@ -14,7 +15,7 @@
 
 sources_dir = './sources'
 
-header = '"date","time","type","session","sitting","committee","url"'
+header = '"date","time","type","title","session","sitting","committee","url"\n'
 
 xpath_title = '//a[contains(@onclick, "jsDetail")]/text()'
 xpath_link_params = '//a[contains(@onclick, "jsDetail")]/@onclick'
@@ -60,34 +61,17 @@ def parse_meeting_schedule(filename):
     committees = p.xpath(xpath_committee)
 
     datetimes = [dt for dt in datetimes if dt.strip() != '']
+    link_params = [link_param.replace('jsDetail(', '').replace(');return false;', '') for link_param in link_params]
 
     dates = [dt[:date_length] for dt in datetimes]
     times = [dt[date_length:] for dt in datetimes]
-
     # raw titles look like '[type] title'
     types = [title[title.find('[')+1:title.find(']')] for title in raw_titles]
     titles = [title[title.find(']')+2:] for title in raw_titles]
+    sessions = [session_re.findall(title)[0] for title in titles]
+    sittings = [sitting_re.findall(title)[0] for title in titles]
+    links = [eval('get_link_url(%s)' % link_param) for link_param in link_params]
 
-    sessions = [title[title.find(u'제')+1:title.find(u'회')] for title in titles]
-
-    # sessions = [session_re.match(title.encode('utf-8')).group('session') for title in raw_titles]
-
-    # sessions = [session_re.match(title).group('session') for title in raw_titles]
-
-    # print get_link_url('CMMTT', '0','2005110000004', '2006011000140', '2014110049524')
-
-    for date, time, type, title, session, committee in zip(dates, times, types, titles, sessions, committees):
-        print ('"%s","%s","%s","%s","%s","%s"' % (
-            date.strip().encode('utf-8'),
-            time.strip().encode('utf-8'),
-            type.strip().encode('utf-8'),
-            title.strip().encode('utf-8'),
-            session.strip().encode('utf-8'),
-            committee.strip().encode('utf-8')))
-
-    return zip(dates, times, titles, committees)
-
-def get_meeting_details(code):
-    return
+    return zip(dates, times, types, titles, sessions, sittings, committees, links)
 
 def get_meeting_list(start, end):
     if is_dashed(start):
         start = start.replace('-', '')
@@ -101,13 +85,27 @@ def get_meeting_list(start, end):
 
     td = datetime.timedelta(days=1)
 
-    print header
+    csv_filename = 'meetings_%s_%s.csv' % (start, end)
+
+    with open('%s/%s' % (sources_dir, csv_filename), 'w') as f:
+        f.write(header.encode('utf-8'))
+        while startDt <= endDt:
+            filename = str(startDt).replace('-', '')
+            crawl(base_url % filename, sources_dir, filename)
+            result = parse_meeting_schedule('%s/%s.html' % (sources_dir, filename))
+            rows = ['"%s","%s","%s","%s","%s","%s","%s","%s"' % row for row in result]
+            f.write('\n'.join(rows).encode('utf-8'))
+            f.write('\n')
+            startDt = startDt + td
 
-    while startDt <= endDt:
-        filename = str(startDt).replace('-', '')
-        crawl(base_url % filename, sources_dir, filename)
-        parse_meeting_schedule('%s/%s.html' % (sources_dir, filename))
-        startDt = startDt + td
+    print 'parsed to %s' % csv_filename
 
 if __name__ == '__main__':
-    get_meeting_list('2014-10-13', '2014-11-13')
+    if len(sys.argv) == 1:
+        print 'usage: python get.py YYYY-MM-DD YYYY-MM-DD'
+        print '       python get.py YYYY-MM-DD'
+    elif len(sys.argv) == 2:
+        get_meeting_list(sys.argv[1], sys.argv[1])
+    elif len(sys.argv) == 3:
+        get_meeting_list(sys.argv[1], sys.argv[2])
\ No newline at end of file
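
Note on the links column: patch 2 builds each detail URL by eval()-ing the argument list scraped from the jsDetail(...) onclick attribute, which runs page content as Python. A sketch of an eval-free alternative, assuming the arguments are plain single-quoted strings; parse_link_params is a hypothetical helper, and the sample onclick value is reassembled from the IDs in patch 1's commented-out test call:

    # Sketch: recover get_link_url's arguments from a scraped onclick value
    # without eval(). Assumes simple 'single-quoted' arguments only.
    def parse_link_params(onclick):
        raw = onclick.replace('jsDetail(', '').replace(');return false;', '')
        return [arg.strip().strip("'") for arg in raw.split(',')]

    onclick = "jsDetail('CMMTT', '0','2005110000004', '2006011000140', '2014110049524');return false;"
    print parse_link_params(onclick)
    # ['CMMTT', '0', '2005110000004', '2006011000140', '2014110049524']
    # get_link_url(*parse_link_params(onclick)) would then build the URL.
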
From 7cf31dd209c01be2f236d7f2d77a44078612fed0 Mon Sep 17 00:00:00 2001
From: Youngkyoung Lee
Date: Thu, 13 Nov 2014 01:14:38 +0900
Subject: [PATCH 3/3] meetings_calendar help comment added.

---
 README.md | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/README.md b/README.md
index 880337e..cfa91a7 100644
--- a/README.md
+++ b/README.md
@@ -56,6 +56,13 @@ Get [National Assembly meetings](http://likms.assembly.go.kr/record/).
     cd meetings
     python crawl.py
 
+#### meetings_calendar
+Get [National Assembly meetings calendar](http://www.assembly.go.kr/renew10/anc/schedule/assm/assemact/council/council0101/assmSchCal/assemSchCal.do).
+
+    cd meetings_calendar
+    python get.py 2014-11-01 2014-11-11  # To get meeting schedules from 2014-11-01 to 2014-11-11, or
+    python get.py 2014-11-01             # To get the meeting schedule for 2014-11-01
+
 #### national_assembly
 Get member information from the [Korean National Assembly](http://www.assembly.go.kr/).
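
A run such as python get.py 2014-11-01 2014-11-11 writes meetings_20141101_20141111.csv into ./sources (get_meeting_list strips the dashes before building the file name). A short sketch of reading that file back, assuming the run above has completed:

    # Sketch: read back the CSV written by get.py (Python 2).
    # Assumes `python get.py 2014-11-01 2014-11-11` has already run.
    import csv

    with open('./sources/meetings_20141101_20141111.csv', 'rb') as f:
        reader = csv.reader(f)
        columns = reader.next()   # date, time, type, title, session, sitting, committee, url
        for row in reader:
            if not row:           # days without meetings leave blank lines
                continue
            print row[0], row[3]  # each meeting's date and title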