-
Notifications
You must be signed in to change notification settings - Fork 4
/
isu_course_crawler.rb
147 lines (128 loc) · 4.98 KB
/
isu_course_crawler.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
##
# 義守課程爬蟲
# http://netreg.isu.edu.tw/wapp/wapp_sha/wap_s140000_bilingual.asp
#
module CourseCrawler::Crawlers
  # Crawler for the I-Shou University (ISU) course query site.
  #
  # The query form page is downloaded once in the constructor; #courses then
  # posts one query per department <option> found on that page and converts
  # every result row into a course hash.
  #
  # All HTTP traffic goes through the external `curl` binary, and responses are
  # converted from Big5 to UTF-8 with Iconv before being parsed by Nokogiri.
  class IsuCourseCrawler < CourseCrawler::Base
    # Maps a period code found in the timetable cells to a period number.
    PERIODS = CoursePeriod.find('ISU').code_map.merge("z" => 5).freeze

    # Page that carries the query form (and the department <option> list).
    QUERY_FORM_URL = 'http://netreg.isu.edu.tw/wapp/wapp_sha/wap_s140000_bilingual.asp'.freeze
    # Endpoint the query form is posted to.
    QUERY_RESULT_URL = 'http://netreg.isu.edu.tw/wapp/wapp_sha/wap_s140001.asp'.freeze
    # Prefix prepended to the relative syllabus links found in result rows.
    SYLLABUS_BASE_URL = 'http://netreg.isu.edu.tw/wapp/wapp_sha/'.freeze
    # CSS selector matching the course rows of a result page.
    ROW_SELECTOR = 'table:nth-child(7) tr:nth-child(n+3)'.freeze

    # Grade range sent with every query (the original comment notes the form
    # accepts grades 1~15).
    GRADE_BEG = 1
    GRADE_END = 9

    # Division flags sent as divi_<key>=<value>.
    # NOTE(review): every flag carries its own letter except F, which sends
    # "T". This looks like a copy/paste slip but is kept as-is because the
    # server-side meaning cannot be confirmed from this file.
    DIVISIONS = {
      'A' => 'A', 'M' => 'M', 'I' => 'I', 'D' => 'D',
      'B' => 'B', 'G' => 'G', 'T' => 'T', 'F' => 'T'
    }.freeze

    # Number of day_N / period_N / location_N slots emitted per course.
    SLOT_COUNT = 9

    # @param year [Integer] Gregorian year; 1911 is subtracted from it when the
    #   query is built, so it must be given before #courses is called
    # @param term [Integer] term number (1 = first semester, 2 = second)
    # @param update_progress [#call, nil] stored, but not invoked by this class
    # @param after_each [#call, nil] called with `course:` after each course is built
    def initialize(year: nil, term: nil, update_progress: nil, after_each: nil)
      @year = year
      @term = term
      @update_progress_proc = update_progress
      @after_each_proc = after_each
      # Despite its name, @query_url holds the raw body of the query form page.
      @query_url = curl('-s', QUERY_FORM_URL, '--compressed')
      @ic = Iconv.new('utf-8//IGNORE//translit', 'big5')
    end

    # Crawls every department and returns the collected courses.
    #
    # @return [Array<Hash>] one hash per course (see #build_course)
    def courses
      @courses = []
      puts "get url ..."
      form_page = Nokogiri::HTML(@ic.iconv(@query_url))
      majr_option = form_page.css('table')[1].css('tr')[2].css('option')

      # The last <option> is deliberately left out, exactly as before
      # (presumably it is not a real department -- TODO confirm).
      majr_option.to_a[0...-1].each do |option|
        # Department code: the first two characters of the option text.
        majr_no = option.text[0..1]
        result_page = Nokogiri::HTML(@ic.iconv(post_course_query(majr_no)))
        # Query the row set once per page instead of once per access.
        rows = result_page.css(ROW_SELECTOR)

        rows.each_with_index do |row, index|
          data = mix_data(result_page, index, rows)
          puts "Department : #{option.text} , data crawled : #{index + 1}"
          # Rows with an empty first cell continue the previous course and
          # have already been merged into it by #mix_data.
          next if data[0] == ""

          course = build_course(data, syllabus_url_for(row))
          @after_each_proc.call(course: course) if @after_each_proc
          @courses << course
        end
      end

      puts "Project finished !!!"
      @courses
    end

    # Extracts the cell texts of row +tr+ and, by looking ahead at the next
    # row, merges it in when that row is a continuation of this one (its first
    # cell is empty).
    #
    # Indices 18, 19 and 20 of the returned array are filled with parallel
    # arrays of day numbers, period numbers and locations.
    #
    # @param doc [#css] parsed result page
    # @param tr [Integer] zero-based index into the course rows
    # @param rows [#[]] course rows of +doc+; defaults to querying +doc+, and
    #   is passed along on recursion so the selector is evaluated only once
    # @return [Array] cell texts plus the schedule arrays described above
    def mix_data(doc, tr, rows = doc.css(ROW_SELECTOR))
      data = cell_texts(rows[tr])
      data[18], data[19], data[20] = [], [], []

      # Cells 10..16 are the seven day columns; each word character in a cell
      # is one period code. `to_s` tolerates rows with fewer cells.
      data[10..16].each.with_index(1) do |cell, day|
        cell.to_s.scan(/\w/).each do |code|
          data[18] << day
          data[19] << PERIODS[code]
          data[20] << data[9]
        end
      end

      next_row = rows[tr + 1]
      return data unless next_row

      if cell_texts(next_row)[0] == ""
        continuation = mix_data(doc, tr + 1, rows)
        data[4] += ",#{continuation[4]}"
        data[18] += continuation[18]
        data[19] += continuation[19]
        data[20] += continuation[20]
      end
      data
    end

    private

    # Runs curl with the given arguments and returns its standard output.
    # The argv form bypasses the shell, so values scraped from remote pages
    # cannot alter the command line.
    def curl(*args)
      IO.popen(['curl', *args], &:read)
    end

    # Posts the course query for one department and returns the raw response body.
    def post_course_query(majr_no)
      fields = [
        "lange_sel=zh_TW",
        "qry_setyear=#{@year - 1911}",
        "qry_setterm=#{@term}",
        "grade_beg=#{GRADE_BEG}",
        "grade_end=#{GRADE_END}",
        "majr_no=#{majr_no}",
        *DIVISIONS.map { |name, value| "divi_#{name}=#{value}" },
        "cr_code=",
        "cr_name=",
        "yepg_sel=+",
        "crdnum_beg=0",
        "crdnum_end=6",
        "apt_code=+",
        "submit1=%B0e%A5X"
      ]
      curl('-s', QUERY_RESULT_URL, '--data', fields.join('&'), '--compressed')
    end

    # Returns the absolute syllabus URL linked from +row+, or nil when the row has no link.
    def syllabus_url_for(row)
      link = row.css('td a')[0]
      link ? "#{SYLLABUS_BASE_URL}#{link[:href]}" : nil
    end

    # Returns the text of every <td> in +row+ with whitespace removed.
    def cell_texts(row)
      row.css('td').map { |td| td.text.gsub(/[\s\r\t\n ]/, "") }
    end

    # Builds the course hash for one merged row produced by #mix_data.
    def build_course(data, syllabus_url)
      course = {
        year: @year,                                     # Gregorian year
        term: @term,                                     # term (1 = first semester, 2 = second)
        name: data[2],                                   # course name
        lecturer: data[4],                               # lecturer(s)
        credits: data[5].to_i,                           # credits
        code: "#{@year}-#{@term}-#{data[0]}_#{data[1]}",
        general_code: data[1],                           # course selection code
        url: syllabus_url,                               # syllabus link
        required: data[6].to_s.include?("必"),           # required vs. elective
        department: data[3],                             # offering department / class
        # department_code: majr_no,                      # department code
        # notes: data[17],                               # remarks
        # people_limit: data[6],                         # enrolment limit
        # people: data[7],                               # enrolled count
      }

      # Emit day_1..9, then period_1..9, then location_1..9 (same key order as
      # the hand-written hash this replaces).
      { day: data[18], period: data[19], location: data[20] }.each do |prefix, values|
        (1..SLOT_COUNT).each { |slot| course[:"#{prefix}_#{slot}"] = values[slot - 1] }
      end
      course
    end
  end
end