From 2ca2055abfc422365b00d6dd7c1f93cef7d42aba Mon Sep 17 00:00:00 2001
From: Youngkyoung Lee
Date: Wed, 12 Nov 2014 18:47:19 +0900
Subject: [PATCH 1/3] meeting schedule crawler (in progress)

---
 meetings_calendar/get.py | 113 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 113 insertions(+)
 create mode 100644 meetings_calendar/get.py

diff --git a/meetings_calendar/get.py b/meetings_calendar/get.py
new file mode 100644
index 0000000..b25ffee
--- /dev/null
+++ b/meetings_calendar/get.py
@@ -0,0 +1,113 @@
+#! /usr/bin/python2.7
+# -*- coding: utf-8 -*-
+
+import os
+import io
+import urllib2
+import html5lib
+import datetime
+import re
+
+base_url = 'http://www.assembly.go.kr/renew10/anc/schedule/assm/assemact/council/council0101/assmSchCal/assemSchCalInfoAjax.do?currentPage=&movePageNum=&rowPerPage=1000&gubun=&agendaid=&committee_id=&board_id=&record_id=&returnPage=&weekday=&today=&calendarMove=&showDt=&meetingday=%s'
+
+link_url = 'http://www.assembly.go.kr/renew10/anc/schedule/assm/assemact/council/council0101/assmSchCal/assemScanCalDetail.do?gubun=%s&agendaid=%s&committee_id=%s&board_id=%s&record_id=%s'
+
+sources_dir = './sources'
+
+header = '"date","time","type","session","sitting","committee","url"'
+
+xpath_title = '//a[contains(@onclick, "jsDetail")]/text()'
+xpath_link_params = '//a[contains(@onclick, "jsDetail")]/@onclick'
+xpath_datetime = '//dd/text()'
+xpath_committee = '//dd/span/text()'
+
+def is_dashed(s):
+    return '-' in s
+
+def crawl(url, directory, filename):
+    # fetch url and cache it as directory/filename.html
+    if not os.path.exists(directory):
+        os.makedirs(directory)
+
+    r = urllib2.urlopen(url)
+    with open('%s/%s.html' % (directory, filename), 'w') as f:
+        f.write(r.read())
+
+def get_webpage(f):
+    parser = html5lib.HTMLParser(
+        tree=html5lib.treebuilders.getTreeBuilder("lxml"),
+        namespaceHTMLElements=False)
+    return parser.parse(f, encoding='utf-8')
+
+def get_link_url(gubun, agendaid, committee_id, board_id, record_id):
+    return link_url % (gubun, agendaid, committee_id, board_id, record_id)
+
+def parse_meeting_schedule(filename):
+    date_length = len('0000-00-00') + 1  # 'YYYY-MM-DD' plus the separator after it
+
+    # 제N회 = Nth session, 제N차 = Nth sitting
+    session_re = re.compile(u'제(?P<session>[0-9]+)회')
+    sitting_re = re.compile(u'제(?P<sitting>[0-9]+)차')
+
+    with open(filename, 'r') as f:
+        p = get_webpage(f)
+
+    raw_titles = p.xpath(xpath_title)
+    link_params = p.xpath(xpath_link_params)
+    datetimes = p.xpath(xpath_datetime)
+    committees = p.xpath(xpath_committee)
+
+    datetimes = [dt for dt in datetimes if dt.strip() != '']
+
+    dates = [dt[:date_length] for dt in datetimes]
+    times = [dt[date_length:] for dt in datetimes]
+
+    # raw titles look like '[type] title'
+    types = [title[title.find('[')+1:title.find(']')] for title in raw_titles]
+    titles = [title[title.find(']')+2:] for title in raw_titles]
+
+    sessions = [title[title.find(u'제')+1:title.find(u'회')] for title in titles]
+
+    # sessions = [session_re.match(title.encode('utf-8')).group('session') for title in raw_titles]
+
+    # sessions = [session_re.match(title).group('session') for title in raw_titles]
+
+    # print get_link_url('CMMTT', '0','2005110000004', '2006011000140', '2014110049524')
+
+    for date, time, type, title, session, committee in zip(dates, times, types, titles, sessions, committees):
+        print ('"%s","%s","%s","%s","%s","%s"' % (
+            date.strip().encode('utf-8'),
+            time.strip().encode('utf-8'),
+            type.strip().encode('utf-8'),
+            title.strip().encode('utf-8'),
+            session.strip().encode('utf-8'),
+            committee.strip().encode('utf-8')))
+
+    return zip(dates, times, titles, committees)
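
Note on the session/sitting regexes: patch 1 still slices each title by the characters 제 and 회, but the two named-group regexes it defines (session_re, sitting_re) become the real extraction mechanism in patch 2 below. A minimal standalone sanity check of those regexes; the sample title is a made-up string in the calendar's format, not scraped data:

    # -*- coding: utf-8 -*-
    # Sanity check for the regexes defined in get.py.
    # The title below is illustrative only, not real calendar data.
    import re

    session_re = re.compile(u'제(?P<session>[0-9]+)회')
    sitting_re = re.compile(u'제(?P<sitting>[0-9]+)차')

    title = u'제329회 국회(정기회) 제12차 본회의'
    print session_re.search(title).group('session')   # 329
    print sitting_re.search(title).group('sitting')   # 12
    print session_re.findall(title)                   # [u'329'], the form patch 2 uses
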
+
+def get_meeting_details(code):
+    return
+
+def get_meeting_list(start, end):
+    if is_dashed(start):
+        start = start.replace('-', '')
+
+    if is_dashed(end):
+        end = end.replace('-', '')
+
+    startDt = datetime.datetime.strptime(start, '%Y%m%d').date()
+    endDt = datetime.datetime.strptime(end, '%Y%m%d').date()
+
+    td = datetime.timedelta(days=1)
+
+    print header
+
+    while startDt <= endDt:
+        filename = str(startDt).replace('-', '')
+        crawl(base_url % filename, sources_dir, filename)
+        parse_meeting_schedule('%s/%s.html' % (sources_dir, filename))
+        startDt = startDt + td
+
+if __name__ == '__main__':
+    get_meeting_list('2014-10-13', '2014-11-13')

From 8d431a98d8fb75a83abcc5a568743ba48bc98273 Mon Sep 17 00:00:00 2001
From: Youngkyoung Lee
Date: Thu, 13 Nov 2014 01:08:41 +0900
Subject: [PATCH 2/3] meetings_calendar crawler added.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Added the meeting schedule (의사일정) crawler.

---
 meetings_calendar/get.py | 58 +++++++++++++++++++---------------------
 1 file changed, 28 insertions(+), 30 deletions(-)

diff --git a/meetings_calendar/get.py b/meetings_calendar/get.py
index b25ffee..bb7a1d1 100644
--- a/meetings_calendar/get.py
+++ b/meetings_calendar/get.py
@@ -7,6 +7,7 @@
 import html5lib
 import datetime
 import re
+import sys
 
 base_url = 'http://www.assembly.go.kr/renew10/anc/schedule/assm/assemact/council/council0101/assmSchCal/assemSchCalInfoAjax.do?currentPage=&movePageNum=&rowPerPage=1000&gubun=&agendaid=&committee_id=&board_id=&record_id=&returnPage=&weekday=&today=&calendarMove=&showDt=&meetingday=%s'
 
@@ -14,7 +15,7 @@
 
 sources_dir = './sources'
 
-header = '"date","time","type","session","sitting","committee","url"'
+header = '"date","time","type","title","session","sitting","committee","url"\n'
 
 xpath_title = '//a[contains(@onclick, "jsDetail")]/text()'
 xpath_link_params = '//a[contains(@onclick, "jsDetail")]/@onclick'
@@ -60,34 +61,17 @@ def parse_meeting_schedule(filename):
     committees = p.xpath(xpath_committee)
 
     datetimes = [dt for dt in datetimes if dt.strip() != '']
+    link_params = [link_param.replace('jsDetail(', '').replace(');return false;', '') for link_param in link_params]
 
     dates = [dt[:date_length] for dt in datetimes]
     times = [dt[date_length:] for dt in datetimes]
-
     # raw titles look like '[type] title'
     types = [title[title.find('[')+1:title.find(']')] for title in raw_titles]
     titles = [title[title.find(']')+2:] for title in raw_titles]
+    sessions = [session_re.findall(title)[0] for title in titles]
+    sittings = [sitting_re.findall(title)[0] for title in titles]
+    links = [eval('get_link_url(%s)' % link_param) for link_param in link_params]
 
-    sessions = [title[title.find(u'제')+1:title.find(u'회')] for title in titles]
-
-    # sessions = [session_re.match(title.encode('utf-8')).group('session') for title in raw_titles]
-
-    # sessions = [session_re.match(title).group('session') for title in raw_titles]
-
-    # print get_link_url('CMMTT', '0','2005110000004', '2006011000140', '2014110049524')
-
-    for date, time, type, title, session, committee in zip(dates, times, types, titles, sessions, committees):
-        print ('"%s","%s","%s","%s","%s","%s"' % (
-            date.strip().encode('utf-8'),
-            time.strip().encode('utf-8'),
-            type.strip().encode('utf-8'),
-            title.strip().encode('utf-8'),
-            session.strip().encode('utf-8'),
-            committee.strip().encode('utf-8')))
-
-    return zip(dates, times, titles, committees)
-
-def get_meeting_details(code):
-    return
+    return zip(dates, times, types, titles, sessions, sittings, committees, links)
 
 def get_meeting_list(start, end):
     if is_dashed(start):
         start = start.replace('-', '')
@@ -101,13 +85,27 @@ def get_meeting_list(start, end):
 
     td = datetime.timedelta(days=1)
 
-    print header
+    csv_filename = 'meetings_%s_%s.csv' % (start, end)
+
+    with open('%s/%s' % (sources_dir, csv_filename), 'w') as f:
+        f.write(header.encode('utf-8'))
+        while startDt <= endDt:
+            filename = str(startDt).replace('-', '')
+            crawl(base_url % filename, sources_dir, filename)
+            result = parse_meeting_schedule('%s/%s.html' % (sources_dir, filename))
+            rows = ['"%s","%s","%s","%s","%s","%s","%s","%s"' % row for row in result]
+            f.write('\n'.join(rows).encode('utf-8'))
+            f.write('\n')
+            startDt = startDt + td
 
-    while startDt <= endDt:
-        filename = str(startDt).replace('-', '')
-        crawl(base_url % filename, sources_dir, filename)
-        parse_meeting_schedule('%s/%s.html' % (sources_dir, filename))
-        startDt = startDt + td
+    print 'parsed to %s' % csv_filename
 
 if __name__ == '__main__':
-    get_meeting_list('2014-10-13', '2014-11-13')
+    if len(sys.argv) == 1:
+        print 'usage: python get.py YYYY-MM-DD YYYY-MM-DD'
+        print '       python get.py YYYY-MM-DD'
+    elif len(sys.argv) == 2:
+        get_meeting_list(sys.argv[1], sys.argv[1])
+    elif len(sys.argv) == 3:
+        get_meeting_list(sys.argv[1], sys.argv[2])
\ No newline at end of file
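
Note on the links column: patch 2 builds each detail URL by eval()-ing the argument list scraped from the jsDetail(...) onclick attribute, which runs page content as Python. A sketch of an eval-free alternative, assuming the arguments are plain single-quoted strings; parse_link_params is a hypothetical helper, and the sample onclick value is reassembled from the IDs in patch 1's commented-out test call:

    # Sketch: recover get_link_url's arguments from a scraped onclick value
    # without eval(). Assumes simple 'single-quoted' arguments only.
    def parse_link_params(onclick):
        raw = onclick.replace('jsDetail(', '').replace(');return false;', '')
        return [arg.strip().strip("'") for arg in raw.split(',')]

    onclick = "jsDetail('CMMTT', '0','2005110000004', '2006011000140', '2014110049524');return false;"
    print parse_link_params(onclick)
    # ['CMMTT', '0', '2005110000004', '2006011000140', '2014110049524']
    # get_link_url(*parse_link_params(onclick)) would then build the URL.
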
From 7cf31dd209c01be2f236d7f2d77a44078612fed0 Mon Sep 17 00:00:00 2001
From: Youngkyoung Lee
Date: Thu, 13 Nov 2014 01:14:38 +0900
Subject: [PATCH 3/3] meetings_calendar help comment added.

---
 README.md | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/README.md b/README.md
index 880337e..cfa91a7 100644
--- a/README.md
+++ b/README.md
@@ -56,6 +56,13 @@ Get [National Assembly meetings](http://likms.assembly.go.kr/record/).
     cd meetings
     python crawl.py
 
+#### meetings_calendar
+Get [National Assembly meetings calendar](http://www.assembly.go.kr/renew10/anc/schedule/assm/assemact/council/council0101/assmSchCal/assemSchCal.do).
+
+    cd meetings_calendar
+    python get.py 2014-11-01 2014-11-11  # To get meeting schedules from 2014-11-01 to 2014-11-11, or
+    python get.py 2014-11-01             # To get the meeting schedule for 2014-11-01
+
 #### national_assembly
 Get member information from the [Korean National Assembly](http://www.assembly.go.kr/).
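
A run such as python get.py 2014-11-01 2014-11-11 writes meetings_20141101_20141111.csv into ./sources (get_meeting_list strips the dashes before building the file name). A short sketch of reading that file back, assuming the run above has completed:

    # Sketch: read back the CSV written by get.py (Python 2).
    # Assumes `python get.py 2014-11-01 2014-11-11` has already run.
    import csv

    with open('./sources/meetings_20141101_20141111.csv', 'rb') as f:
        reader = csv.reader(f)
        columns = reader.next()   # date, time, type, title, session, sitting, committee, url
        for row in reader:
            if not row:           # days without meetings leave blank lines
                continue
            print row[0], row[3]  # each meeting's date and title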