|
@@ -1,142 +1,79 @@
|
1
|
1
|
require 'dotenv/load'
|
2
|
|
-require 'net/http'
|
3
|
|
-require 'nokogiri'
|
4
|
2
|
require 'sequel'
|
5
|
3
|
|
6
|
4
|
require_relative '../db/connect'
|
7
|
5
|
require_relative '../lib/auth'
|
8
|
|
-
|
9
|
|
-def fetch_page(page_number, cookie)
|
10
|
|
- url = URI("http://board.vivalavinyl.com/thread/list/#{page_number}")
|
11
|
|
- http = Net::HTTP.new(url.host, url.port)
|
12
|
|
- request = Net::HTTP::Get.new(url)
|
13
|
|
- request['cookie'] = cookie
|
14
|
|
- response = http.request(request)
|
15
|
|
- Nokogiri.HTML(response.body)
|
16
|
|
-end
|
17
|
|
-
|
18
|
|
-def fetch_thread(thread, cookie)
|
19
|
|
- sleep(0.5)
|
20
|
|
- url =
|
21
|
|
- URI(
|
22
|
|
- "http://board.vivalavinyl.com/thread/view/#{thread[:remote_id]}&ajax=true"
|
23
|
|
- )
|
24
|
|
- http = Net::HTTP.new(url.host, url.port)
|
25
|
|
- request = Net::HTTP::Get.new(url)
|
26
|
|
- request['cookie'] = cookie
|
27
|
|
- response = http.request(request)
|
28
|
|
- Nokogiri.HTML(response.body)
|
29
|
|
-end
|
30
|
|
-
|
31
|
|
-def parse_posts(thread, page)
|
32
|
|
- posts = Array.new
|
33
|
|
-
|
34
|
|
- page.css('.post').each do |_post|
|
35
|
|
- post = Hash.new
|
36
|
|
- post[:remote_id] = _post.at_css('ul.view')[:id].split('_')[1].to_i
|
37
|
|
- post[:creator] = _post.at_css('.memberlink').text.strip
|
38
|
|
- date, time =
|
39
|
|
- _post.at_css('.postinfo').text.split('posted this')[1].split('@')
|
40
|
|
- post[:created_at] = Time.parse("#{date} #{time}")
|
41
|
|
- post[:body] = _post.at_css('.postbody').children.map(&:to_html).join.strip
|
42
|
|
- post[:thread_id] = thread[:id]
|
43
|
|
- posts << post
|
|
6
|
+require_relative '../lib/fetch'
|
|
7
|
+require_relative '../lib/insert'
|
|
8
|
+require_relative '../lib/parse'
|
|
9
|
+
|
|
10
|
# Scrapes listing pages of the board and stores threads and posts that are not
# yet present in the database.
#
# Relies on collaborators defined outside this file: the top-level +login+ and
# +connect+ methods and the +Fetch+, +Parse+ and +Insert+ helpers.
# NOTE(review): +connect+ is presumably a Sequel database handle, given the
# +from(...).first(...)+ calls below -- confirm against db/connect.
class Scraper
  # @param first [Integer] first listing page number to visit
  # @param last [Integer] last listing page number to visit (inclusive)
  # @param log [Boolean] when true, progress is written to standard output
  def initialize(first: 0, last: 0, log: false)
    @first = first
    @last = last
    @log = log
    @cookie = login(ENV['VLV_USERNAME'], ENV['VLV_PASSWORD'])
    @db = connect
  end

  # Visits every listing page in @first..@last and scrapes each thread on it.
  #
  # Sticky threads are only scraped on the first visited page. The whole run
  # stops as soon as a non-sticky thread reports that it has no new posts.
  #
  # @return [Range, nil] the page range when every page was visited, nil when
  #   the run stopped early
  def scrape
    (@first..@last).each_with_index do |page_number, page_index|
      page = Fetch.page(page_number, @cookie)

      Parse.threads(page).each do |t|
        next if page_index > 0 && t[:is_sticky]
        next unless scrape_thread(t)

        # The thread had nothing new: a sticky thread is simply skipped,
        # anything else ends the run.
        return unless t[:is_sticky]
      end
    end
  end

  # Fetches one thread, makes sure it has a row in +threads+, then stores its
  # posts.
  #
  # Sets +t[:created_at]+ as a side effect.
  #
  # @param t [Hash] parsed thread; reads :title and :remote_id
  # @return [Boolean] true when the thread had no new posts, false otherwise
  #   (including when the fetched page has no first post)
  def scrape_thread(t)
    log t[:title]

    page = Fetch.thread(t, @cookie)
    first_post = page.at_css('.postinfo:first-child')
    return false if first_post.nil?

    t[:created_at] = Parse.thread_created_at(first_post)

    thread = @db.from(:threads).first(remote_id: t[:remote_id])
    if thread.nil?
      thread = Insert.thread(t, @db)
      log ' Inserting thread'
    end

    scrape_posts(thread, page)
  end

  # Stores every post parsed from +page+ that is not already in +posts+.
  #
  # @param thread [Hash] thread record handed through to Parse.posts
  # @param page [Object] fetched thread page handed through to Parse.posts
  # @return [Boolean] true when the last parsed post is already stored (nothing
  #   new), false when posts were processed or none could be parsed
  def scrape_posts(thread, page)
    posts = Parse.posts(thread, page)
    # Nothing parsed: there is no last post to compare against, so report
    # "not caught up" instead of failing on nil.
    return false if posts.empty?

    if post_stored?(posts.last)
      log ' No new posts'
      return true
    end

    posts_count = posts.size
    posts.each_with_index do |p, index|
      msg = " Inserting post #{index + 1}/#{posts_count}"
      print msg if @log
      Insert.post(p, @db) unless post_stored?(p)
      # Backspace over the progress message so the next one overwrites it;
      # the final message is left in place.
      print "\b" * msg.size if @log && index != posts_count - 1
    end

    log ''

    false
  end

  # Writes +msg+ followed by a newline to standard output when logging is on.
  #
  # @param msg [String]
  def log(msg)
    puts msg if @log
  end

  private

  # True when a row with this post's :remote_id already exists in +posts+.
  def post_stored?(post)
    !@db.from(:posts).first(remote_id: post[:remote_id]).nil?
  end
end
|