|
@@ -16,6 +16,7 @@ def fetch_page(page_number, cookie)
|
16
|
16
|
end
|
17
|
17
|
|
18
|
18
|
def fetch_thread(thread, cookie)
|
|
19
|
+ sleep(0.5)
|
19
|
20
|
url =
|
20
|
21
|
URI(
|
21
|
22
|
"http://board.vivalavinyl.com/thread/view/#{thread[:remote_id]}&ajax=true"
|
|
@@ -65,25 +66,28 @@ def parse_threads(page)
|
65
|
66
|
thread_link = row.at_css('.subject > a')
|
66
|
67
|
next if thread_link.nil?
|
67
|
68
|
thread[:remote_id] = thread_link['href'].split('/')[3]
|
68
|
|
- thread[:title] = thread_link.text
|
|
69
|
+ thread[:title] = thread_link.text.strip
|
69
|
70
|
creator = row.at_css('.memberlink').text.strip
|
70
|
71
|
creator = creator[0..-2] if creator.match(/\+$/)
|
71
|
72
|
thread[:creator] = creator
|
|
73
|
+ thread[:is_sticky] = !!thread[:title].match(/^Sticky:/)
|
72
|
74
|
threads << thread
|
73
|
75
|
end
|
74
|
76
|
threads
|
75
|
77
|
end
|
76
|
78
|
|
77
|
|
-def scrape(first: 0, last: 0)
|
|
79
|
+def scrape(first: 0, last: 0, log: false)
|
78
|
80
|
cookie = login(ENV['VLV_USERNAME'], ENV['VLV_PASSWORD'])
|
79
|
81
|
db = connect
|
80
|
82
|
|
81
|
|
- (first..last).each do |page_number|
|
|
83
|
+ (first..last).each_with_index do |page_number, index|
|
82
|
84
|
page = fetch_page(page_number, cookie)
|
83
|
85
|
threads = parse_threads(page)
|
84
|
86
|
threads.each do |t|
|
85
|
|
- puts t[:title]
|
86
|
|
- sleep(1)
|
|
87
|
+ next if index > 0 && t[:is_sticky]
|
|
88
|
+
|
|
89
|
+ puts t[:title] if log
|
|
90
|
+
|
87
|
91
|
page = fetch_thread(t, cookie)
|
88
|
92
|
first_post = page.at_css('.postinfo:first-child')
|
89
|
93
|
|
|
@@ -96,7 +100,7 @@ def scrape(first: 0, last: 0)
|
96
|
100
|
thread = db.from(:threads).first(remote_id: t[:remote_id])
|
97
|
101
|
is_new_thread = thread.nil?
|
98
|
102
|
if is_new_thread
|
99
|
|
- puts ' Inserting thread'
|
|
103
|
+ puts ' Inserting thread' if log
|
100
|
104
|
id =
|
101
|
105
|
db.from(:threads).insert(
|
102
|
106
|
title: t[:title],
|
|
@@ -112,13 +116,14 @@ def scrape(first: 0, last: 0)
|
112
|
116
|
|
113
|
117
|
last_post = posts.last
|
114
|
118
|
unless db.from(:posts).first(remote_id: last_post[:remote_id]).nil?
|
|
119
|
+ next if t[:is_sticky]
|
115
|
120
|
break
|
116
|
121
|
end
|
117
|
122
|
|
118
|
123
|
posts_count = posts.size
|
119
|
124
|
posts.each_with_index do |p, index|
|
120
|
125
|
msg = " Inserting post #{index + 1}/#{posts_count}"
|
121
|
|
- print msg
|
|
126
|
+ print msg if log
|
122
|
127
|
if is_new_thread || db.from(:posts).first(remote_id: p[:remote_id]).nil?
|
123
|
128
|
db.from(:posts).insert(
|
124
|
129
|
body: p[:body],
|
|
@@ -128,10 +133,10 @@ def scrape(first: 0, last: 0)
|
128
|
133
|
remote_id: p[:remote_id]
|
129
|
134
|
)
|
130
|
135
|
end
|
131
|
|
- print "\b" * msg.size unless index == posts_count - 1
|
|
136
|
+ print "\b" * msg.size unless index == posts_count - 1 if log
|
132
|
137
|
end
|
133
|
138
|
|
134
|
|
- puts
|
|
139
|
+ puts if log
|
135
|
140
|
end
|
136
|
141
|
end
|
137
|
142
|
end
|