|
@@ -16,6 +16,7 @@ def fetch_page(page_number, cookie)
|
16
|
16
|
end
|
17
|
17
|
|
18
|
18
|
def fetch_thread(thread, cookie)
|
|
19
|
+ sleep(0.5)
|
19
|
20
|
url =
|
20
|
21
|
URI(
|
21
|
22
|
"http://board.vivalavinyl.com/thread/view/#{thread[:remote_id]}&ajax=true"
|
|
@@ -74,7 +75,7 @@ def parse_threads(page)
|
74
|
75
|
threads
|
75
|
76
|
end
|
76
|
77
|
|
77
|
|
-def scrape(first: 0, last: 0)
|
|
78
|
+def scrape(first: 0, last: 0, log: false)
|
78
|
79
|
cookie = login(ENV['VLV_USERNAME'], ENV['VLV_PASSWORD'])
|
79
|
80
|
db = connect
|
80
|
81
|
|
|
@@ -82,9 +83,11 @@ def scrape(first: 0, last: 0)
|
82
|
83
|
page = fetch_page(page_number, cookie)
|
83
|
84
|
threads = parse_threads(page)
|
84
|
85
|
threads.each do |t|
|
85
|
|
- sleep(1)
|
86
|
86
|
is_sticky = t[:title].match(/^Sticky:/)
|
87
|
87
|
next if index > 0 && is_sticky
|
|
88
|
+
|
|
89
|
+ puts t[:title] if log
|
|
90
|
+
|
88
|
91
|
page = fetch_thread(t, cookie)
|
89
|
92
|
first_post = page.at_css('.postinfo:first-child')
|
90
|
93
|
|
|
@@ -97,7 +100,7 @@ def scrape(first: 0, last: 0)
|
97
|
100
|
thread = db.from(:threads).first(remote_id: t[:remote_id])
|
98
|
101
|
is_new_thread = thread.nil?
|
99
|
102
|
if is_new_thread
|
100
|
|
- puts ' Inserting thread'
|
|
103
|
+ puts ' Inserting thread' if log
|
101
|
104
|
id =
|
102
|
105
|
db.from(:threads).insert(
|
103
|
106
|
title: t[:title],
|
|
@@ -120,7 +123,7 @@ def scrape(first: 0, last: 0)
|
120
|
123
|
posts_count = posts.size
|
121
|
124
|
posts.each_with_index do |p, index|
|
122
|
125
|
msg = " Inserting post #{index + 1}/#{posts_count}"
|
123
|
|
- print msg
|
|
126
|
+ print msg if log
|
124
|
127
|
if is_new_thread || db.from(:posts).first(remote_id: p[:remote_id]).nil?
|
125
|
128
|
db.from(:posts).insert(
|
126
|
129
|
body: p[:body],
|
|
@@ -130,10 +133,10 @@ def scrape(first: 0, last: 0)
|
130
|
133
|
remote_id: p[:remote_id]
|
131
|
134
|
)
|
132
|
135
|
end
|
133
|
|
- print "\b" * msg.size unless index == posts_count - 1
|
|
136
|
+ print "\b" * msg.size unless index == posts_count - 1 if log
|
134
|
137
|
end
|
135
|
138
|
|
136
|
|
- puts
|
|
139
|
+ puts if log
|
137
|
140
|
end
|
138
|
141
|
end
|
139
|
142
|
end
|