-
Notifications
You must be signed in to change notification settings - Fork 1
/
importer.py
141 lines (104 loc) · 4.73 KB
/
importer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Author: Yusuke Yamamoto
# Email: [email protected]
# URL: http://hontolab.org/
# Licence: MIT License
""" Wikipedia本文データXMLをMySQLデータベースにインポートするためのクラス
"""
import MySQLdb
import xml.etree.ElementTree as ET
from tqdm import tqdm
# XML namespace of the MediaWiki export schema (version 0.10), written in the
# "{uri}" prefix form so it can be prepended to tag names in ElementTree
# lookups.
NAMESPACE = "{http://www.mediawiki.org/xml/export-0.10/}"
class WikipediaContentImporter(object):
    """Import article text from a Wikipedia XML dump into a MySQL database.

    Pages whose ``<ns>`` value is 0 are read from the dump one at a time and
    written to the ``revision`` and ``text`` tables of the configured
    database.
    """

    def __init__(self, host='localhost', port=3306, db='jawiki',
                 user='root', passwd='passwd', charset='utf8'):
        """Store the MySQL connection settings; no connection is opened here.

        Args:
            host: MySQL server host name.
            port: MySQL server port.
            db: Name of the database holding the ``page``, ``revision`` and
                ``text`` tables.
            user: MySQL user name.
            passwd: MySQL password.
            charset: Character set passed to the connection.
        """
        self.host = host
        self.port = port
        self.db = db
        self.user = user
        self.passwd = passwd
        self.charset = charset

    def __open_mysql_connection(self):
        """Open the connection and a cursor, kept on ``self.connect`` / ``self.cursor``."""
        self.connect = MySQLdb.connect(host=self.host,
                                       port=self.port,
                                       user=self.user,
                                       passwd=self.passwd,
                                       db=self.db, charset=self.charset)
        self.cursor = self.connect.cursor()

    def __close_mysql_connection(self):
        """Close the cursor and then the connection.

        The connection is closed even when closing the cursor raises, so a
        failing cursor cannot leak the underlying connection.
        """
        try:
            self.cursor.close()
        finally:
            self.connect.close()

    def __insert_revision_record(self, page_id, rev_id):
        """Insert a ``revision`` row, or update the page's row if the insert collides.

        ``rev_text_id`` is set to ``rev_id``, matching the ``old_id`` used by
        ``__insert_text_record``.
        """
        try:
            sql = "INSERT INTO {db}.revision (rev_id, rev_page, rev_text_id) VALUES (%s, %s, %s)".format(db=self.db)
            self.cursor.execute(sql, (rev_id, page_id, rev_id))
        except MySQLdb.IntegrityError:
            # A conflicting row already exists: overwrite the row of this page.
            sql = "UPDATE {db}.revision SET rev_id = %s, rev_text_id = %s WHERE rev_page = %s".format(db=self.db)
            self.cursor.execute(sql, (rev_id, rev_id, page_id))
        self.connect.commit()

    def __insert_text_record(self, rev_id, content):
        """Insert a ``text`` row keyed by ``rev_id``, or update it if it already exists."""
        try:
            sql = "INSERT INTO {db}.text (old_id, old_text, old_flags) VALUES (%s, %s, 'utf-8')".format(db=self.db)
            self.cursor.execute(sql, (rev_id, content))
        except MySQLdb.IntegrityError:
            # A row with this old_id already exists: replace its text.
            sql = "UPDATE {db}.text SET old_text = %s WHERE old_id = %s".format(db=self.db)
            self.cursor.execute(sql, (content, rev_id))
        self.connect.commit()

    def __init_tables(self):
        """Truncate the ``revision`` and ``text`` tables."""
        for table in ("revision", "text"):
            sql = "TRUNCATE TABLE {db}.{table}".format(db=self.db, table=table)
            self.cursor.execute(sql)
            self.connect.commit()

    def __get_ns_value(self, xml_element):
        """Return the integer value of the ``<ns>`` child of a ``<page>`` element."""
        return int(xml_element.find("./{}ns".format(NAMESPACE)).text)

    def __get_page_id_value(self, xml_element):
        """Return the integer value of the ``<id>`` child of a ``<page>`` element."""
        return int(xml_element.find("./{}id".format(NAMESPACE)).text)

    def __get_revision_id_value(self, xml_element):
        """Return the integer value of ``<revision>/<id>`` of a ``<page>`` element."""
        return int(xml_element.find("./{}revision/{}id".format(NAMESPACE, NAMESPACE)).text)

    def __get_text_value(self, xml_element):
        """Return the text of ``<revision>/<text>`` of a ``<page>`` element."""
        return xml_element.find("./{}revision/{}text".format(NAMESPACE, NAMESPACE)).text

    def __get_title_value(self, xml_element):
        """Return the text of the ``<title>`` child of a ``<page>`` element."""
        return xml_element.find("./{}title".format(NAMESPACE)).text

    def import_page_article_xml_serially(self, file_path, extract_count=None, init_table=True):
        """Stream a dump file and store every namespace-0 page in MySQL.

        Args:
            file_path: Path of the XML dump, handed to ``ET.iterparse``.
            extract_count: Maximum number of namespace-0 pages to import.
                When falsy, the row count of the ``page`` table is used.
            init_table: When true, truncate ``revision`` and ``text`` before
                importing.

        The MySQL connection and the progress bar are always closed, even
        when parsing or a database statement raises.
        """
        self.__open_mysql_connection()
        progress_bar = None
        try:
            if init_table:
                # Start from empty revision and text tables.
                self.__init_tables()

            if not extract_count:
                sql = "SELECT COUNT(*) FROM {db}.page".format(db=self.db)
                self.cursor.execute(sql)
                extract_count = self.cursor.fetchone()[0]

            context = iter(ET.iterparse(file_path, events=('start', 'end')))
            # The first event carries the document root; keep it so that
            # already-processed pages can be detached from it later.
            _, root = next(context)

            progress_bar = tqdm(total=extract_count)
            page_tag = "{}page".format(NAMESPACE)
            itercount = 1
            for event, elem in context:
                if event != "end" or elem.tag != page_tag:
                    continue

                if self.__get_ns_value(elem) == 0:
                    page_id = self.__get_page_id_value(elem)
                    rev_id = self.__get_revision_id_value(elem)
                    text = self.__get_text_value(elem)

                    # Insert data into MySQL tables
                    self.__insert_revision_record(page_id, rev_id)
                    self.__insert_text_record(rev_id, text)

                    progress_bar.update(1)
                    if itercount >= extract_count:
                        break
                    itercount += 1

                # Free memory: drop the pages parsed so far from the tree.
                root.clear()
        finally:
            if progress_bar is not None:
                progress_bar.close()
            self.__close_mysql_connection()