Browse Source

Refactor scraping

master
Dylan Baker 4 years ago
parent
commit
b113165dc6
5 changed files with 151 additions and 122 deletions
  1. 2
    1
      Rakefile
  2. 58
    121
      db/scrape.rb
  3. 24
    0
      lib/fetch.rb
  4. 21
    0
      lib/insert.rb
  5. 46
    0
      lib/parse.rb

+ 2
- 1
Rakefile View File

@@ -9,7 +9,8 @@ task 'migrate' do
9 9
 end
10 10
 
11 11
 task 'scrape' do
12
-  scrape(log: ENV['APP_ENV'] == 'development')
12
+  scraper = Scraper.new(log: ENV['APP_ENV'] == 'development')
13
+  scraper.scrape
13 14
 end
14 15
 
15 16
 task 'build' do

+ 58
- 121
db/scrape.rb View File

@@ -1,142 +1,79 @@
1 1
 require 'dotenv/load'
2
-require 'net/http'
3
-require 'nokogiri'
4 2
 require 'sequel'
5 3
 
6 4
 require_relative '../db/connect'
7 5
 require_relative '../lib/auth'
8
-
9
-def fetch_page(page_number, cookie)
10
-  url = URI("http://board.vivalavinyl.com/thread/list/#{page_number}")
11
-  http = Net::HTTP.new(url.host, url.port)
12
-  request = Net::HTTP::Get.new(url)
13
-  request['cookie'] = cookie
14
-  response = http.request(request)
15
-  Nokogiri.HTML(response.body)
16
-end
17
-
18
-def fetch_thread(thread, cookie)
19
-  sleep(0.5)
20
-  url =
21
-    URI(
22
-      "http://board.vivalavinyl.com/thread/view/#{thread[:remote_id]}&ajax=true"
23
-    )
24
-  http = Net::HTTP.new(url.host, url.port)
25
-  request = Net::HTTP::Get.new(url)
26
-  request['cookie'] = cookie
27
-  response = http.request(request)
28
-  Nokogiri.HTML(response.body)
29
-end
30
-
31
-def parse_posts(thread, page)
32
-  posts = Array.new
33
-
34
-  page.css('.post').each do |_post|
35
-    post = Hash.new
36
-    post[:remote_id] = _post.at_css('ul.view')[:id].split('_')[1].to_i
37
-    post[:creator] = _post.at_css('.memberlink').text.strip
38
-    date, time =
39
-      _post.at_css('.postinfo').text.split('posted this')[1].split('@')
40
-    post[:created_at] = Time.parse("#{date} #{time}")
41
-    post[:body] = _post.at_css('.postbody').children.map(&:to_html).join.strip
42
-    post[:thread_id] = thread[:id]
43
-    posts << post
6
+require_relative '../lib/fetch'
7
+require_relative '../lib/insert'
8
+require_relative '../lib/parse'
9
+
10
# Walks the board's thread index pages and mirrors threads/posts into
# the local database.
class Scraper
  # first/last: inclusive range of index page numbers to scrape.
  # log:        when true, progress messages are printed to stdout.
  #
  # NOTE(review): `login` and `connect` come from lib/auth and
  # db/connect — presumably @cookie is a session cookie string and
  # @db a Sequel database handle; confirm against those files.
  def initialize(first: 0, last: 0, log: false)
    @first = first
    @last = last
    @log = log
    @cookie = login(ENV['VLV_USERNAME'], ENV['VLV_PASSWORD'])
    @db = connect
  end

  # Scrapes every configured index page. Stops entirely the first time a
  # non-sticky thread turns out to have no new posts — the board lists
  # threads by recent activity, so everything after that point is
  # already stored.
  def scrape
    (@first..@last).each_with_index do |page_number, page_index|
      page = Fetch.page(page_number, @cookie)

      Parse.threads(page).each do |t|
        # Stickies repeat on every index page; process them on the first
        # page only.
        next if page_index > 0 && t[:is_sticky]

        no_new_posts = scrape_thread(t)
        if no_new_posts
          # A stale sticky says nothing about the rest of the board.
          next if t[:is_sticky]
          return
        end
      end
    end
  end

  # Scrapes a single thread: records it if unseen, then scrapes its
  # posts. Returns true when the thread contained no new posts (the
  # caller's stop signal), false otherwise.
  def scrape_thread(t)
    log t[:title]

    page = Fetch.thread(t, @cookie)
    first_post = page.at_css('.postinfo:first-child')
    # An empty/unparsable thread view is not a stop signal; keep going.
    return false if first_post.nil?

    t[:created_at] = Parse.thread_created_at(first_post)

    thread = @db.from(:threads).first(remote_id: t[:remote_id])
    if thread.nil?
      thread = Insert.thread(t, @db)
      log '  Inserting thread'
    end

    scrape_posts(thread, page)
  end

  # Inserts every post not already present. Returns true when the
  # newest post was already stored (nothing new), false otherwise.
  def scrape_posts(thread, page)
    posts = Parse.posts(thread, page)
    # Guard: an empty parse would otherwise crash on posts.last below.
    return false if posts.empty?

    unless @db.from(:posts).first(remote_id: posts.last[:remote_id]).nil?
      log '  No new posts'
      return true
    end

    posts_count = posts.size
    posts.each_with_index do |p, index|
      msg = "  Inserting post #{index + 1}/#{posts_count}"
      print msg if @log
      Insert.post(p, @db) if @db.from(:posts).first(remote_id: p[:remote_id]).nil?
      # Backspace over the progress message so the next one overwrites
      # it in place (skip after the final post so it stays visible).
      print "\b" * msg.size if @log && index < posts_count - 1
    end

    log ''

    false
  end

  # Prints msg only when logging is enabled.
  def log(msg)
    puts msg if @log
  end
end

+ 24
- 0
lib/fetch.rb View File

@@ -0,0 +1,24 @@
1
+require 'net/http'
2
+
3
# HTTP helpers for pulling board pages. Responses are parsed with
# Nokogiri.HTML, so `nokogiri` must be loaded before these are called —
# NOTE(review): this file only requires net/http; it currently relies on
# another file requiring nokogiri first. Consider requiring it here.
module Fetch
  # Fetches one page of the thread index.
  # Returns the parsed Nokogiri document.
  def self.page(page_number, cookie)
    get(URI("http://board.vivalavinyl.com/thread/list/#{page_number}"), cookie)
  end

  # Fetches a single thread via the board's ajax view.
  # Sleeps briefly first so repeated calls don't hammer the board — this
  # rate limit existed in the pre-refactor fetch_thread and was dropped
  # in the refactor; restored here.
  def self.thread(thread, cookie)
    sleep(0.5)
    get(
      URI(
        "http://board.vivalavinyl.com/thread/view/#{thread[:remote_id]}&ajax=true"
      ),
      cookie
    )
  end

  # Shared GET: sends the session cookie and parses the HTML body.
  def self.get(url, cookie)
    http = Net::HTTP.new(url.host, url.port)
    request = Net::HTTP::Get.new(url)
    request['cookie'] = cookie
    response = http.request(request)
    Nokogiri.HTML(response.body)
  end
  private_class_method :get
end

+ 21
- 0
lib/insert.rb View File

@@ -0,0 +1,21 @@
1
# Thin persistence helpers: write parsed post/thread hashes into the
# given Sequel database handle.
module Insert
  # Persists one parsed post row. Returns whatever the dataset insert
  # returns (the new row's id under Sequel).
  def self.post(post, db)
    row = {
      body: post[:body],
      created_at: post[:created_at],
      creator: post[:creator],
      thread_id: post[:thread_id],
      remote_id: post[:remote_id]
    }
    db.from(:posts).insert(row)
  end

  # Persists one parsed thread and returns a copy of the hash with the
  # freshly assigned :id merged in (the input hash is not mutated).
  def self.thread(thread, db)
    new_id = db.from(:threads).insert(
      title: thread[:title],
      creator: thread[:creator],
      remote_id: thread[:remote_id],
      created_at: thread[:created_at]
    )
    thread.merge(id: new_id)
  end
end

+ 46
- 0
lib/parse.rb View File

@@ -0,0 +1,46 @@
1
+require 'nokogiri'
2
+
3
require 'time' # Time.parse lives in the 'time' stdlib; don't rely on transitive requires

# Pure HTML-extraction helpers: turn fetched board pages into plain
# hashes. Pages are expected to be Nokogiri documents, but only the
# css/at_css/text interface is used.
module Parse
  # Extracts the thread listing from an index page.
  # Returns an array of hashes with :remote_id, :title, :creator,
  # :is_sticky. Rows without a subject link are skipped.
  def self.threads(page)
    threads = []

    page.css('.even, .odd').each do |row|
      link = row.at_css('.subject > a')
      next if link.nil?

      creator = row.at_css('.memberlink').text.strip
      # The board suffixes some member names with '+'; drop it.
      creator = creator[0..-2] if creator.match(/\+$/)

      title = link.text.strip
      threads << {
        remote_id: link['href'].split('/')[3],
        title: title,
        creator: creator,
        is_sticky: !!title.match(/^Sticky:/)
      }
    end

    threads
  end

  # Extracts every post from a thread page, tagging each with the
  # parent thread's id. Returns an array of hashes with :remote_id,
  # :creator, :created_at, :body, :thread_id.
  def self.posts(thread, page)
    posts = []

    page.css('.post').each do |node|
      posts << {
        # e.g. id="post_12345" -> 12345
        remote_id: node.at_css('ul.view')[:id].split('_')[1].to_i,
        creator: node.at_css('.memberlink').text.strip,
        created_at: timestamp_from(node.at_css('.postinfo').text),
        body: node.at_css('.postbody').children.map(&:to_html).join.strip,
        thread_id: thread[:id]
      }
    end

    posts
  end

  # Parses the creation time out of a thread's first .postinfo node.
  def self.thread_created_at(first_post)
    timestamp_from(first_post.text)
  end

  # Shared timestamp parsing: post info reads
  # "<member> posted this <date> @ <time>".
  def self.timestamp_from(text)
    date, time = text.split('posted this')[1].split('@')
    Time.parse("#{date} #{time}")
  end
  private_class_method :timestamp_from
end

Loading…
Cancel
Save