-
Notifications
You must be signed in to change notification settings - Fork 4
/
pu_course_crawler.rb
129 lines (119 loc) · 4.86 KB
/
pu_course_crawler.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
##
# Providence University (靜宜大學) course crawler.
# Source page: http://alcat.pu.edu.tw/2011courseAbstract/main.php?type=mutinew&lang=zh
#
module CourseCrawler::Crawlers
  # Crawls the Providence University (靜宜大學) course-abstract search page and
  # turns every result row into a flat course hash.
  #
  # Network access is done by shelling out to `curl`; HTML is parsed with
  # Nokogiri. `current_year` / `current_term` are expected to come from
  # CourseCrawler::Base (not visible in this file).
  class PuCourseCrawler < CourseCrawler::Base
    # Weekday character used on the site => day number (Monday = 1).
    DAYS = {
      "一" => 1,
      "二" => 2,
      "三" => 3,
      "四" => 4,
      "五" => 5,
      "六" => 6,
      "日" => 7,
    }.freeze

    # Period label used on the site => sequential period index.
    # "午" (noon) sits between periods 4 and 5, hence the shift by one afterwards.
    PERIODS = {
      "1" => 1,
      "2" => 2,
      "3" => 3,
      "4" => 4,
      "午" => 5,
      "5" => 6,
      "6" => 7,
      "7" => 8,
      "8" => 9,
      "9" => 10,
      "10" => 11,
      "11" => 12,
      "12" => 13,
      "13" => 14,
    }.freeze

    # Host prepended to the relative syllabus links found in result rows.
    SYLLABUS_HOST = 'http://alcat.pu.edu.tw'

    # Number of day_N / period_N / location_N slots emitted per course.
    SLOT_COUNT = 9

    # Separator between a period list and its location. The site is expected to
    # use the fullwidth colon (U+FF1A); the ASCII colon is accepted as well.
    COLON_REGEX = /[:\uFF1A]/

    # One weekday followed by one or more " <periods><colon><location>" groups.
    # Only DAYS *keys* go into the day character class (the values are digits
    # and must not be accepted as a weekday). \u3001 is the ideographic comma.
    TIME_LOCATION_REGEX = /(?<day>[#{DAYS.keys.join}])(?<peri_loc>(\ \s?(?<period>((\d+)?午?\u3001?)+)[:\uFF1A]?(?<location>[伯思靜任一二計方格主體高室田游保]?[鐸源安垣研濟倫顧育校外徑場泳]?[館外池網]?球?場?(\d+)?))+)/

    # The comma-separated period list that precedes the colon inside a
    # `peri_loc` capture of TIME_LOCATION_REGEX.
    PERIOD_LIST_REGEX = /(?<period>([\d午]\d?\u3001?)+)[:\uFF1A]/

    # @param year [Integer, nil] Gregorian year; defaults to `current_year`
    # @param term [Integer, nil] term number; defaults to `current_term`
    # @param update_progress [#call, nil] stored but not used by this crawler
    # @param after_each [#call, nil] invoked as `call(course: hash)` for every course
    def initialize(year: nil, term: nil, update_progress: nil, after_each: nil)
      @year = year || current_year
      @term = term || current_term
      @update_progress_proc = update_progress
      @after_each_proc = after_each
      @query_url = 'http://alcat.pu.edu.tw/2011courseAbstract/main.php?type=mutinew&lang=zh'
    end

    # Queries the search form once per department and collects all courses.
    #
    # @return [Array<Hash>] one hash per result row that carries a course code
    def courses
      @courses = []

      department_options.each do |dep_c, dep_n|
        # The site expects the year as (Gregorian year - 1911) followed by the term.
        # NOTE(review): dep_c is sent as-is (not URL-encoded), as before.
        form = "ls_yearsem=#{@year - 1911}#{@term}&selectno=&weekday=&section=&cus_select=" \
               "&classattri=1&subjname=&teaname=&opunit=#{dep_c}&opclass=&lessonlang=" \
               '&search=%E6%90%9C%E5%B0%8B&click_ok=Y'
        doc = Nokogiri::HTML(fetch_page(form))

        # nth-child(n+2) skips the first (header) row of the result table.
        doc.css('table[class="table_info"] tr:nth-child(n+2)').each do |tr|
          # Skip the row that only advertises the second-hand textbook site.
          next if tr.text.include?("經濟部智慧財產局校園二手教科書網")

          course = build_course(tr, dep_n)
          next if course.nil?

          @after_each_proc.call(course: course) if @after_each_proc
          @courses << course
        end
      end

      @courses
    end

    private

    # Runs curl against @query_url and returns the response body.
    # The argv form of IO.popen bypasses the shell, so the `&` in the URL is not
    # treated as a shell operator and remote-supplied values cannot inject commands.
    #
    # @param form_data [String, nil] body for a POST (`curl -d`); GET when nil
    # @return [String] whatever curl wrote to stdout
    def fetch_page(form_data = nil)
      command = ['curl', '-s']
      command.push('-d', form_data) if form_data
      command << @query_url
      IO.popen(command, &:read)
    end

    # @return [Array<Array(String, String)>] [value, label] of every department
    #   option except the first entry of the select box
    def department_options
      doc = Nokogiri::HTML(fetch_page)
      doc.css('select[name="opunit"] option:not(:first-child)').map { |opt| [opt[:value], opt.text] }
    end

    # Builds the course hash for one result row.
    #
    # @param tr [Nokogiri::XML::Node] a result-table row
    # @param dep_n [String] label of the department being queried
    # @return [Hash, nil] nil when the row has no course code
    def build_course(tr, dep_n)
      data = tr.css('td').map(&:text)

      # Duplicate course codes are expected: the school lists a course once per
      # lecturer when it has several.
      general_code = data[0].to_s[/\w+/]
      return nil if general_code.nil?

      # Only the first digit of the credits cell is used (0 when there is none).
      credits = data[5] && data[5][/\d/].to_i
      days, periods, locations = parse_schedule(data[7])

      course = {
        year: @year, # Gregorian year
        term: @term, # 1 = first term, 2 = second term
        name: data[3].to_s[/(?<name>(\S+\s?)+)/, 'name'],
        lecturer: data[6].to_s[/\S+/],
        credits: credits,
        code: "#{@year}-#{@term}-#{general_code}",
        general_code: general_code,
        url: syllabus_url_for(tr), # link to the course syllabus, if any
        required: data[2].to_s.include?('必'), # required vs. elective
        department: "#{dep_n} #{data[1].to_s[/\S+/]}",
      }

      # Keep the original key order: all day_N, then period_N, then location_N.
      { day: days, period: periods, location: locations }.each do |prefix, values|
        SLOT_COUNT.times { |i| course[:"#{prefix}_#{i + 1}"] = values[i] }
      end

      course
    end

    # @param tr [Nokogiri::XML::Node] a result-table row
    # @return [String, nil] absolute syllabus URL taken from the row's first link
    def syllabus_url_for(tr)
      link = tr.css('td a').first
      href = link && link[:href]
      # Drop the first two characters of the relative href before prefixing the
      # host (presumably a leading ".." — confirm against live markup).
      href && "#{SYLLABUS_HOST}#{href[2..-1]}"
    end

    # Expands a time/location cell into parallel arrays with one entry per
    # (day, period) pair.
    #
    # @param text [String, nil] raw cell text
    # @return [Array(Array, Array, Array)] days, periods, locations
    def parse_schedule(text)
      days = []
      periods = []
      locations = []
      return [days, periods, locations] if text.nil?

      text.scan(TIME_LOCATION_REGEX).each do |day, peri_loc|
        period_list = peri_loc[PERIOD_LIST_REGEX, 'period']
        next if period_list.nil? # no "<periods><colon>" part: nothing to record

        location = peri_loc.split(COLON_REGEX)[-1]
        period_list.split("\u3001").each do |period|
          days << DAYS[day]
          periods << PERIODS[period]
          locations << location
        end
      end

      [days, periods, locations]
    end
  end
end