Browse Source

Refactor scraper

master
Dylan Baker 2 years ago
parent
commit
db0d10b969
1 changed file with 41 additions and 27 deletions
  1. 41
    27
      db/scrape.rb

+ 41
- 27
db/scrape.rb View File

13
     @first = first
13
     @first = first
14
     @last = last
14
     @last = last
15
     @log = log
15
     @log = log
16
+    @no_new_posts = false
16
 
17
 
17
     authenticate!
18
     authenticate!
18
   end
19
   end
19
 
20
 
20
   def scrape
21
   def scrape
21
     (@first..@last).each_with_index do |page_number, page_index|
22
     (@first..@last).each_with_index do |page_number, page_index|
22
-      page = Fetcher.new(cookie: @cookie).page(page_number)
23
-      threads = Parser.new.threads(page)
23
+      page = fetcher.page(page_number)
24
+      threads = parser.threads(page)
24
 
25
 
25
       threads.each do |t|
26
       threads.each do |t|
26
         next if page_index > 0 && t[:is_sticky]
27
         next if page_index > 0 && t[:is_sticky]
27
-        no_new_posts = scrape_thread(t)
28
+
29
+        scrape_thread(t)
30
+
28
         if no_new_posts
31
         if no_new_posts
29
-          next if t[:is_sticky]
30
-          return
32
+          if t[:is_sticky]
33
+            next
34
+          else
35
+            return
36
+          end
31
         end
37
         end
32
       end
38
       end
33
     end
39
     end
35
 
41
 
36
   private
42
   private
37
 
43
 
38
-  def authenticate!
39
-    @cookie = login(ENV['VLV_USERNAME'], ENV['VLV_PASSWORD'])
40
-
41
-    raise "Error logging into VLV. Check your credentials." if @cookie.nil?
42
-  end
44
+  attr_reader :cookie, :fetcher, :parser, :no_new_posts
43
 
45
 
44
   def scrape_thread(t)
46
   def scrape_thread(t)
47
+    @no_new_posts = false
48
+
45
     log t[:title]
49
     log t[:title]
46
 
50
 
47
-    page = Fetcher.new(cookie: @cookie).thread(t)
51
+    page = fetcher.thread(t)
48
     first_post = page.at_css('.postinfo:first-child')
52
     first_post = page.at_css('.postinfo:first-child')
49
     return false if first_post.nil?
53
     return false if first_post.nil?
50
 
54
 
51
     thread = DB.from(:threads).first(remote_id: t[:remote_id])
55
     thread = DB.from(:threads).first(remote_id: t[:remote_id])
52
     if thread.nil?
56
     if thread.nil?
53
-      t[:created_at] = Parser.new.thread_created_at(first_post)
54
-      thread = VLV::Thread.create(t.delete_if { |k| k == :is_sticky })
55
       log '  Inserting thread'
57
       log '  Inserting thread'
58
+      t[:created_at] = parser.thread_created_at(first_post)
59
+      thread = VLV::Thread.create(t.delete_if { |k| k == :is_sticky })
56
     end
60
     end
57
 
61
 
58
     scrape_posts(thread, page)
62
     scrape_posts(thread, page)
59
   end
63
   end
60
 
64
 
61
   def scrape_posts(thread, page)
65
   def scrape_posts(thread, page)
62
-    posts = Parser.new.posts(thread, page)
66
+    posts = parser.posts(thread, page)
63
     last_post = posts.last
67
     last_post = posts.last
64
-    unless DB.from(:posts).first(remote_id: last_post[:remote_id]).nil?
65
-      log '  No new posts'
66
-      return true
67
-    end
68
 
68
 
69
-    posts_count = posts.size
70
-    posts.each_with_index do |p, index|
71
-      msg = "  Inserting post #{index + 1}/#{posts_count}"
72
-      print msg if @log
73
-      if DB.from(:posts).first(remote_id: p[:remote_id]).nil?
74
-        VLV::Post.create(p)
69
+    if DB.from(:posts).first(remote_id: last_post[:remote_id]).nil?
70
+      db_posts = VLV::Post.where(remote_id: posts.map { |p| p[:remote_id] }).all
71
+
72
+      posts = posts.each_with_index.map do |post|
73
+        if db_posts.detect { |db_post| db_post.remote_id == post[:remote_id] }.nil?
74
+          post
75
+        end
75
       end
76
       end
76
-      print "\b" * msg.size unless index == posts_count - 1 if @log
77
+
78
+      log "  Inserting #{posts.size} posts"
79
+      VLV::Post.multi_insert(posts.compact)
80
+    else
81
+      no_new_posts!
82
+      log '  No new posts'
77
     end
83
     end
84
+  end
78
 
85
 
79
-    log ''
86
+  def no_new_posts!
87
+    @no_new_posts = true
88
+  end
89
+
90
+  def authenticate!
91
+    @cookie = login(ENV['VLV_USERNAME'], ENV['VLV_PASSWORD'])
92
+    raise "Error logging into VLV. Check your credentials." if @cookie.nil?
80
 
93
 
81
-    false
94
+    @fetcher = Fetcher.new(cookie: cookie)
95
+    @parser = Parser.new
82
   end
96
   end
83
 
97
 
84
   def log(msg)
98
   def log(msg)

Loading…
Cancel
Save