Browse Source

Merge branch 'specs'

* specs:
  Specs
  Rename scraper
  Refactor scraper
  Allow passing DB into migration
master
Dylan Baker 2 years ago
parent
commit
ec8e45ba59
8 changed files with 114 additions and 31 deletions
  1. 1
    0
      .gitignore
  2. 2
    1
      Gemfile
  3. 14
    0
      Gemfile.lock
  4. 15
    1
      Rakefile
  5. 1
    2
      db/migrate.rb
  6. 41
    27
      db/scraper.rb
  7. 28
    0
      spec/scraper_spec.rb
  8. 12
    0
      spec/spec_helper.rb

+ 1
- 0
.gitignore View File

@@ -3,3 +3,4 @@ vendor/
3 3
 web/public/*.css
4 4
 web/public/*.js
5 5
 db/log
6
+spec/vcr

+ 2
- 1
Gemfile View File

@@ -13,5 +13,6 @@ gem 'rspec', '~> 3.9'
13 13
 gem 'sassc', '~> 2.2'
14 14
 gem 'sequel', '~> 5.30'
15 15
 gem 'sinatra', '~> 2.0'
16
-
17 16
 gem "truncato", "~> 0.7.11"
17
+gem "vcr", "~> 6.0"
18
+gem "webmock", "~> 3.14"

+ 14
- 0
Gemfile.lock View File

@@ -1,9 +1,14 @@
1 1
 GEM
2 2
   remote: https://rubygems.org/
3 3
   specs:
4
+    addressable (2.8.0)
5
+      public_suffix (>= 2.0.2, < 5.0)
6
+    crack (0.4.5)
7
+      rexml
4 8
     diff-lcs (1.3)
5 9
     dotenv (2.7.5)
6 10
     ffi (1.12.2)
11
+    hashdiff (1.0.1)
7 12
     htmlentities (4.3.4)
8 13
     httparty (0.18.0)
9 14
       mime-types (~> 3.0)
@@ -18,10 +23,12 @@ GEM
18 23
     nokogiri (1.10.9)
19 24
       mini_portile2 (~> 2.4.0)
20 25
     pg (1.2.3)
26
+    public_suffix (4.0.6)
21 27
     rack (2.2.2)
22 28
     rack-protection (2.0.8.1)
23 29
       rack
24 30
     rake (13.0.1)
31
+    rexml (3.2.5)
25 32
     rspec (3.9.0)
26 33
       rspec-core (~> 3.9.0)
27 34
       rspec-expectations (~> 3.9.0)
@@ -48,6 +55,11 @@ GEM
48 55
     truncato (0.7.11)
49 56
       htmlentities (~> 4.3.1)
50 57
       nokogiri (>= 1.7.0, <= 2.0)
58
+    vcr (6.0.0)
59
+    webmock (3.14.0)
60
+      addressable (>= 2.8.0)
61
+      crack (>= 0.3.2)
62
+      hashdiff (>= 0.4.0, < 2.0.0)
51 63
 
52 64
 PLATFORMS
53 65
   ruby
@@ -64,6 +76,8 @@ DEPENDENCIES
64 76
   sequel (~> 5.30)
65 77
   sinatra (~> 2.0)
66 78
   truncato (~> 0.7.11)
79
+  vcr (~> 6.0)
80
+  webmock (~> 3.14)
67 81
 
68 82
 BUNDLED WITH
69 83
    2.1.4

+ 15
- 1
Rakefile View File

@@ -12,8 +12,22 @@ task 'migrate' do
12 12
   migrate
13 13
 end
14 14
 
15
+task 'migrate_test' do
16
+  require_relative './db/migrate'
17
+
18
+  TEST_DB = Sequel.connect(
19
+    adapter: :postgres,
20
+    database: ENV['DB_DATABASE'] + '_test',
21
+    user: ENV['DB_USERNAME'],
22
+    password: ENV['DB_PASSWORD'],
23
+    logger: ENV['APP_ENV'] == 'development' ? Logger.new('db/log') : nil
24
+  )
25
+
26
+  migrate(db: TEST_DB)
27
+end
28
+
15 29
 task 'scrape' do
16
-  require_relative './db/scrape'
30
+  require_relative './db/scraper'
17 31
 
18 32
   should_log = ENV['APP_ENV'] == 'development' || ARGV.include?('--log')
19 33
   scraper = Scraper.new(log: should_log)

+ 1
- 2
db/migrate.rb View File

@@ -3,8 +3,7 @@ require 'sequel'
3 3
 
4 4
 require_relative 'connect'
5 5
 
6
-def migrate
7
-  db = DB
6
+def migrate(db: DB)
8 7
   db.create_table? :threads do
9 8
     primary_key :id
10 9
     String :title

db/scrape.rb → db/scraper.rb View File

@@ -13,21 +13,27 @@ class Scraper
13 13
     @first = first
14 14
     @last = last
15 15
     @log = log
16
+    @no_new_posts = false
16 17
 
17 18
     authenticate!
18 19
   end
19 20
 
20 21
   def scrape
21 22
     (@first..@last).each_with_index do |page_number, page_index|
22
-      page = Fetcher.new(cookie: @cookie).page(page_number)
23
-      threads = Parser.new.threads(page)
23
+      page = fetcher.page(page_number)
24
+      threads = parser.threads(page)
24 25
 
25 26
       threads.each do |t|
26 27
         next if page_index > 0 && t[:is_sticky]
27
-        no_new_posts = scrape_thread(t)
28
+
29
+        scrape_thread(t)
30
+
28 31
         if no_new_posts
29
-          next if t[:is_sticky]
30
-          return
32
+          if t[:is_sticky]
33
+            next
34
+          else
35
+            return
36
+          end
31 37
         end
32 38
       end
33 39
     end
@@ -35,50 +41,58 @@ class Scraper
35 41
 
36 42
   private
37 43
 
38
-  def authenticate!
39
-    @cookie = login(ENV['VLV_USERNAME'], ENV['VLV_PASSWORD'])
40
-
41
-    raise "Error logging into VLV. Check your credentials." if @cookie.nil?
42
-  end
44
+  attr_reader :cookie, :fetcher, :parser, :no_new_posts
43 45
 
44 46
   def scrape_thread(t)
47
+    @no_new_posts = false
48
+
45 49
     log t[:title]
46 50
 
47
-    page = Fetcher.new(cookie: @cookie).thread(t)
51
+    page = fetcher.thread(t)
48 52
     first_post = page.at_css('.postinfo:first-child')
49 53
     return false if first_post.nil?
50 54
 
51 55
     thread = DB.from(:threads).first(remote_id: t[:remote_id])
52 56
     if thread.nil?
53
-      t[:created_at] = Parser.new.thread_created_at(first_post)
54
-      thread = VLV::Thread.create(t.delete_if { |k| k == :is_sticky })
55 57
       log '  Inserting thread'
58
+      t[:created_at] = parser.thread_created_at(first_post)
59
+      thread = VLV::Thread.create(t.delete_if { |k| k == :is_sticky })
56 60
     end
57 61
 
58 62
     scrape_posts(thread, page)
59 63
   end
60 64
 
61 65
   def scrape_posts(thread, page)
62
-    posts = Parser.new.posts(thread, page)
66
+    posts = parser.posts(thread, page)
63 67
     last_post = posts.last
64
-    unless DB.from(:posts).first(remote_id: last_post[:remote_id]).nil?
65
-      log '  No new posts'
66
-      return true
67
-    end
68 68
 
69
-    posts_count = posts.size
70
-    posts.each_with_index do |p, index|
71
-      msg = "  Inserting post #{index + 1}/#{posts_count}"
72
-      print msg if @log
73
-      if DB.from(:posts).first(remote_id: p[:remote_id]).nil?
74
-        VLV::Post.create(p)
69
+    if DB.from(:posts).first(remote_id: last_post[:remote_id]).nil?
70
+      db_posts = VLV::Post.where(remote_id: posts.map { |p| p[:remote_id] }).all
71
+
72
+      posts = posts.each_with_index.map do |post|
73
+        if db_posts.detect { |db_post| db_post.remote_id == post[:remote_id] }.nil?
74
+          post
75
+        end
75 76
       end
76
-      print "\b" * msg.size unless index == posts_count - 1 if @log
77
+
78
+      log "  Inserting #{posts.size} posts"
79
+      VLV::Post.multi_insert(posts.compact)
80
+    else
81
+      no_new_posts!
82
+      log '  No new posts'
77 83
     end
84
+  end
78 85
 
79
-    log ''
86
+  def no_new_posts!
87
+    @no_new_posts = true
88
+  end
89
+
90
+  def authenticate!
91
+    @cookie = login(ENV['VLV_USERNAME'], ENV['VLV_PASSWORD'])
92
+    raise "Error logging into VLV. Check your credentials." if @cookie.nil?
80 93
 
81
-    false
94
+    @fetcher = Fetcher.new(cookie: cookie)
95
+    @parser = Parser.new
82 96
   end
83 97
 
84 98
   def log(msg)

+ 28
- 0
spec/scraper_spec.rb View File

@@ -0,0 +1,28 @@
1
+require 'dotenv/load'
2
+require 'spec_helper'
3
+
4
+require_relative '../db/scraper'
5
+
6
+RSpec.describe Scraper do
7
+  around(:each) do |example|
8
+    DB.transaction(rollback: :always, auto_savepoint: true) { example.run }
9
+  end
10
+
11
+  subject { described_class.new(log: true) }
12
+
13
+  describe '#initialize' do
14
+    it 'authenticates' do
15
+      expect(subject.send(:cookie)).to_not be_nil
16
+    end
17
+  end
18
+
19
+  describe '#scrape' do
20
+    it 'creates new threads and posts' do
21
+      VCR.use_cassette "create_new_threads_and_posts" do
22
+        expect { subject.scrape }
23
+          .to  change { VLV::Thread.count }.by(109)
24
+          .and change { VLV::Post.count   }.by_at_least(5000)
25
+      end
26
+    end
27
+  end
28
+end

+ 12
- 0
spec/spec_helper.rb View File

@@ -0,0 +1,12 @@
1
+require 'vcr'
2
+require "webmock/rspec"
3
+
4
+ENV["DB_DATABASE"] = ENV["DB_DATABASE"] + '_test'
5
+
6
+WebMock.disable_net_connect!(allow_localhost: true)
7
+
8
+VCR.configure do |c|
9
+  c.cassette_library_dir = "spec/vcr"
10
+  c.hook_into :webmock
11
+  c.ignore_localhost = true
12
+end

Loading…
Cancel
Save