Browse Source

Refactor scraper

master
Dylan Baker 2 years ago
parent
commit
db0d10b969
1 changed file with 41 additions and 27 deletions
  1. 41
    27
      db/scrape.rb

+ 41
- 27
db/scrape.rb View File

@@ -13,21 +13,27 @@ class Scraper
13 13
     @first = first
14 14
     @last = last
15 15
     @log = log
16
+    @no_new_posts = false
16 17
 
17 18
     authenticate!
18 19
   end
19 20
 
20 21
   def scrape
21 22
     (@first..@last).each_with_index do |page_number, page_index|
22
-      page = Fetcher.new(cookie: @cookie).page(page_number)
23
-      threads = Parser.new.threads(page)
23
+      page = fetcher.page(page_number)
24
+      threads = parser.threads(page)
24 25
 
25 26
       threads.each do |t|
26 27
         next if page_index > 0 && t[:is_sticky]
27
-        no_new_posts = scrape_thread(t)
28
+
29
+        scrape_thread(t)
30
+
28 31
         if no_new_posts
29
-          next if t[:is_sticky]
30
-          return
32
+          if t[:is_sticky]
33
+            next
34
+          else
35
+            return
36
+          end
31 37
         end
32 38
       end
33 39
     end
@@ -35,50 +41,58 @@ class Scraper
35 41
 
36 42
   private
37 43
 
38
-  def authenticate!
39
-    @cookie = login(ENV['VLV_USERNAME'], ENV['VLV_PASSWORD'])
40
-
41
-    raise "Error logging into VLV. Check your credentials." if @cookie.nil?
42
-  end
44
+  attr_reader :cookie, :fetcher, :parser, :no_new_posts
43 45
 
44 46
   def scrape_thread(t)
47
+    @no_new_posts = false
48
+
45 49
     log t[:title]
46 50
 
47
-    page = Fetcher.new(cookie: @cookie).thread(t)
51
+    page = fetcher.thread(t)
48 52
     first_post = page.at_css('.postinfo:first-child')
49 53
     return false if first_post.nil?
50 54
 
51 55
     thread = DB.from(:threads).first(remote_id: t[:remote_id])
52 56
     if thread.nil?
53
-      t[:created_at] = Parser.new.thread_created_at(first_post)
54
-      thread = VLV::Thread.create(t.delete_if { |k| k == :is_sticky })
55 57
       log '  Inserting thread'
58
+      t[:created_at] = parser.thread_created_at(first_post)
59
+      thread = VLV::Thread.create(t.delete_if { |k| k == :is_sticky })
56 60
     end
57 61
 
58 62
     scrape_posts(thread, page)
59 63
   end
60 64
 
61 65
   def scrape_posts(thread, page)
62
-    posts = Parser.new.posts(thread, page)
66
+    posts = parser.posts(thread, page)
63 67
     last_post = posts.last
64
-    unless DB.from(:posts).first(remote_id: last_post[:remote_id]).nil?
65
-      log '  No new posts'
66
-      return true
67
-    end
68 68
 
69
-    posts_count = posts.size
70
-    posts.each_with_index do |p, index|
71
-      msg = "  Inserting post #{index + 1}/#{posts_count}"
72
-      print msg if @log
73
-      if DB.from(:posts).first(remote_id: p[:remote_id]).nil?
74
-        VLV::Post.create(p)
69
+    if DB.from(:posts).first(remote_id: last_post[:remote_id]).nil?
70
+      db_posts = VLV::Post.where(remote_id: posts.map { |p| p[:remote_id] }).all
71
+
72
+      posts = posts.each_with_index.map do |post|
73
+        if db_posts.detect { |db_post| db_post.remote_id == post[:remote_id] }.nil?
74
+          post
75
+        end
75 76
       end
76
-      print "\b" * msg.size unless index == posts_count - 1 if @log
77
+
78
+      log "  Inserting #{posts.size} posts"
79
+      VLV::Post.multi_insert(posts.compact)
80
+    else
81
+      no_new_posts!
82
+      log '  No new posts'
77 83
     end
84
+  end
78 85
 
79
-    log ''
86
+  def no_new_posts!
87
+    @no_new_posts = true
88
+  end
89
+
90
+  def authenticate!
91
+    @cookie = login(ENV['VLV_USERNAME'], ENV['VLV_PASSWORD'])
92
+    raise "Error logging into VLV. Check your credentials." if @cookie.nil?
80 93
 
81
-    false
94
+    @fetcher = Fetcher.new(cookie: cookie)
95
+    @parser = Parser.new
82 96
   end
83 97
 
84 98
   def log(msg)

Loading…
Cancel
Save