Browse Source

Merge branch 'specs'

* specs:
  Specs
  Rename scraper
  Refactor scraper
  Allow passing DB into migration
master
Dylan Baker 2 years ago
parent
commit
ec8e45ba59
8 changed files with 114 additions and 31 deletions
  1. 1
    0
      .gitignore
  2. 2
    1
      Gemfile
  3. 14
    0
      Gemfile.lock
  4. 15
    1
      Rakefile
  5. 1
    2
      db/migrate.rb
  6. 41
    27
      db/scraper.rb
  7. 28
    0
      spec/scraper_spec.rb
  8. 12
    0
      spec/spec_helper.rb

+ 1
- 0
.gitignore View File

3
 web/public/*.css
3
 web/public/*.css
4
 web/public/*.js
4
 web/public/*.js
5
 db/log
5
 db/log
6
+spec/vcr

+ 2
- 1
Gemfile View File

13
 gem 'sassc', '~> 2.2'
13
 gem 'sassc', '~> 2.2'
14
 gem 'sequel', '~> 5.30'
14
 gem 'sequel', '~> 5.30'
15
 gem 'sinatra', '~> 2.0'
15
 gem 'sinatra', '~> 2.0'
16
-
17
 gem "truncato", "~> 0.7.11"
16
 gem "truncato", "~> 0.7.11"
17
+gem "vcr", "~> 6.0"
18
+gem "webmock", "~> 3.14"

+ 14
- 0
Gemfile.lock View File

1
 GEM
1
 GEM
2
   remote: https://rubygems.org/
2
   remote: https://rubygems.org/
3
   specs:
3
   specs:
4
+    addressable (2.8.0)
5
+      public_suffix (>= 2.0.2, < 5.0)
6
+    crack (0.4.5)
7
+      rexml
4
     diff-lcs (1.3)
8
     diff-lcs (1.3)
5
     dotenv (2.7.5)
9
     dotenv (2.7.5)
6
     ffi (1.12.2)
10
     ffi (1.12.2)
11
+    hashdiff (1.0.1)
7
     htmlentities (4.3.4)
12
     htmlentities (4.3.4)
8
     httparty (0.18.0)
13
     httparty (0.18.0)
9
       mime-types (~> 3.0)
14
       mime-types (~> 3.0)
18
     nokogiri (1.10.9)
23
     nokogiri (1.10.9)
19
       mini_portile2 (~> 2.4.0)
24
       mini_portile2 (~> 2.4.0)
20
     pg (1.2.3)
25
     pg (1.2.3)
26
+    public_suffix (4.0.6)
21
     rack (2.2.2)
27
     rack (2.2.2)
22
     rack-protection (2.0.8.1)
28
     rack-protection (2.0.8.1)
23
       rack
29
       rack
24
     rake (13.0.1)
30
     rake (13.0.1)
31
+    rexml (3.2.5)
25
     rspec (3.9.0)
32
     rspec (3.9.0)
26
       rspec-core (~> 3.9.0)
33
       rspec-core (~> 3.9.0)
27
       rspec-expectations (~> 3.9.0)
34
       rspec-expectations (~> 3.9.0)
48
     truncato (0.7.11)
55
     truncato (0.7.11)
49
       htmlentities (~> 4.3.1)
56
       htmlentities (~> 4.3.1)
50
       nokogiri (>= 1.7.0, <= 2.0)
57
       nokogiri (>= 1.7.0, <= 2.0)
58
+    vcr (6.0.0)
59
+    webmock (3.14.0)
60
+      addressable (>= 2.8.0)
61
+      crack (>= 0.3.2)
62
+      hashdiff (>= 0.4.0, < 2.0.0)
51
 
63
 
52
 PLATFORMS
64
 PLATFORMS
53
   ruby
65
   ruby
64
   sequel (~> 5.30)
76
   sequel (~> 5.30)
65
   sinatra (~> 2.0)
77
   sinatra (~> 2.0)
66
   truncato (~> 0.7.11)
78
   truncato (~> 0.7.11)
79
+  vcr (~> 6.0)
80
+  webmock (~> 3.14)
67
 
81
 
68
 BUNDLED WITH
82
 BUNDLED WITH
69
    2.1.4
83
    2.1.4

+ 15
- 1
Rakefile View File

12
   migrate
12
   migrate
13
 end
13
 end
14
 
14
 
15
+task 'migrate_test' do
16
+  require_relative './db/migrate'
17
+
18
+  TEST_DB = Sequel.connect(
19
+    adapter: :postgres,
20
+    database: ENV['DB_DATABASE'] + '_test',
21
+    user: ENV['DB_USERNAME'],
22
+    password: ENV['DB_PASSWORD'],
23
+    logger: ENV['APP_ENV'] == 'development' ? Logger.new('db/log') : nil
24
+  )
25
+
26
+  migrate(db: TEST_DB)
27
+end
28
+
15
 task 'scrape' do
29
 task 'scrape' do
16
-  require_relative './db/scrape'
30
+  require_relative './db/scraper'
17
 
31
 
18
   should_log = ENV['APP_ENV'] == 'development' || ARGV.include?('--log')
32
   should_log = ENV['APP_ENV'] == 'development' || ARGV.include?('--log')
19
   scraper = Scraper.new(log: should_log)
33
   scraper = Scraper.new(log: should_log)

+ 1
- 2
db/migrate.rb View File

3
 
3
 
4
 require_relative 'connect'
4
 require_relative 'connect'
5
 
5
 
6
-def migrate
7
-  db = DB
6
+def migrate(db: DB)
8
   db.create_table? :threads do
7
   db.create_table? :threads do
9
     primary_key :id
8
     primary_key :id
10
     String :title
9
     String :title

db/scrape.rb → db/scraper.rb View File

13
     @first = first
13
     @first = first
14
     @last = last
14
     @last = last
15
     @log = log
15
     @log = log
16
+    @no_new_posts = false
16
 
17
 
17
     authenticate!
18
     authenticate!
18
   end
19
   end
19
 
20
 
20
   def scrape
21
   def scrape
21
     (@first..@last).each_with_index do |page_number, page_index|
22
     (@first..@last).each_with_index do |page_number, page_index|
22
-      page = Fetcher.new(cookie: @cookie).page(page_number)
23
-      threads = Parser.new.threads(page)
23
+      page = fetcher.page(page_number)
24
+      threads = parser.threads(page)
24
 
25
 
25
       threads.each do |t|
26
       threads.each do |t|
26
         next if page_index > 0 && t[:is_sticky]
27
         next if page_index > 0 && t[:is_sticky]
27
-        no_new_posts = scrape_thread(t)
28
+
29
+        scrape_thread(t)
30
+
28
         if no_new_posts
31
         if no_new_posts
29
-          next if t[:is_sticky]
30
-          return
32
+          if t[:is_sticky]
33
+            next
34
+          else
35
+            return
36
+          end
31
         end
37
         end
32
       end
38
       end
33
     end
39
     end
35
 
41
 
36
   private
42
   private
37
 
43
 
38
-  def authenticate!
39
-    @cookie = login(ENV['VLV_USERNAME'], ENV['VLV_PASSWORD'])
40
-
41
-    raise "Error logging into VLV. Check your credentials." if @cookie.nil?
42
-  end
44
+  attr_reader :cookie, :fetcher, :parser, :no_new_posts
43
 
45
 
44
   def scrape_thread(t)
46
   def scrape_thread(t)
47
+    @no_new_posts = false
48
+
45
     log t[:title]
49
     log t[:title]
46
 
50
 
47
-    page = Fetcher.new(cookie: @cookie).thread(t)
51
+    page = fetcher.thread(t)
48
     first_post = page.at_css('.postinfo:first-child')
52
     first_post = page.at_css('.postinfo:first-child')
49
     return false if first_post.nil?
53
     return false if first_post.nil?
50
 
54
 
51
     thread = DB.from(:threads).first(remote_id: t[:remote_id])
55
     thread = DB.from(:threads).first(remote_id: t[:remote_id])
52
     if thread.nil?
56
     if thread.nil?
53
-      t[:created_at] = Parser.new.thread_created_at(first_post)
54
-      thread = VLV::Thread.create(t.delete_if { |k| k == :is_sticky })
55
       log '  Inserting thread'
57
       log '  Inserting thread'
58
+      t[:created_at] = parser.thread_created_at(first_post)
59
+      thread = VLV::Thread.create(t.delete_if { |k| k == :is_sticky })
56
     end
60
     end
57
 
61
 
58
     scrape_posts(thread, page)
62
     scrape_posts(thread, page)
59
   end
63
   end
60
 
64
 
61
   def scrape_posts(thread, page)
65
   def scrape_posts(thread, page)
62
-    posts = Parser.new.posts(thread, page)
66
+    posts = parser.posts(thread, page)
63
     last_post = posts.last
67
     last_post = posts.last
64
-    unless DB.from(:posts).first(remote_id: last_post[:remote_id]).nil?
65
-      log '  No new posts'
66
-      return true
67
-    end
68
 
68
 
69
-    posts_count = posts.size
70
-    posts.each_with_index do |p, index|
71
-      msg = "  Inserting post #{index + 1}/#{posts_count}"
72
-      print msg if @log
73
-      if DB.from(:posts).first(remote_id: p[:remote_id]).nil?
74
-        VLV::Post.create(p)
69
+    if DB.from(:posts).first(remote_id: last_post[:remote_id]).nil?
70
+      db_posts = VLV::Post.where(remote_id: posts.map { |p| p[:remote_id] }).all
71
+
72
+      posts = posts.each_with_index.map do |post|
73
+        if db_posts.detect { |db_post| db_post.remote_id == post[:remote_id] }.nil?
74
+          post
75
+        end
75
       end
76
       end
76
-      print "\b" * msg.size unless index == posts_count - 1 if @log
77
+
78
+      log "  Inserting #{posts.size} posts"
79
+      VLV::Post.multi_insert(posts.compact)
80
+    else
81
+      no_new_posts!
82
+      log '  No new posts'
77
     end
83
     end
84
+  end
78
 
85
 
79
-    log ''
86
+  def no_new_posts!
87
+    @no_new_posts = true
88
+  end
89
+
90
+  def authenticate!
91
+    @cookie = login(ENV['VLV_USERNAME'], ENV['VLV_PASSWORD'])
92
+    raise "Error logging into VLV. Check your credentials." if @cookie.nil?
80
 
93
 
81
-    false
94
+    @fetcher = Fetcher.new(cookie: cookie)
95
+    @parser = Parser.new
82
   end
96
   end
83
 
97
 
84
   def log(msg)
98
   def log(msg)

+ 28
- 0
spec/scraper_spec.rb View File

1
+require 'dotenv/load'
2
+require 'spec_helper'
3
+
4
+require_relative '../db/scraper'
5
+
6
+RSpec.describe Scraper do
7
+  around(:each) do |example|
8
+    DB.transaction(rollback: :always, auto_savepoint: true) { example.run }
9
+  end
10
+
11
+  subject { described_class.new(log: true) }
12
+
13
+  describe '#initialize' do
14
+    it 'authenticates' do
15
+      expect(subject.send(:cookie)).to_not be_nil
16
+    end
17
+  end
18
+
19
+  describe '#scrape' do
20
+    it 'creates new threads and posts' do
21
+      VCR.use_cassette "create_new_threads_and_posts" do
22
+        expect { subject.scrape }
23
+          .to  change { VLV::Thread.count }.by(109)
24
+          .and change { VLV::Post.count   }.by_at_least(5000)
25
+      end
26
+    end
27
+  end
28
+end

+ 12
- 0
spec/spec_helper.rb View File

1
+require 'vcr'
2
+require "webmock/rspec"
3
+
4
+ENV["DB_DATABASE"] = ENV["DB_DATABASE"] + '_test'
5
+
6
+WebMock.disable_net_connect!(allow_localhost: true)
7
+
8
+VCR.configure do |c|
9
+  c.cassette_library_dir = "spec/vcr"
10
+  c.hook_into :webmock
11
+  c.ignore_localhost = true
12
+end

Loading…
Cancel
Save