14 Commits

Author SHA1 Message Date
  Dylan Baker 48fe82bdbd Don't use full text search for exact matches 2 years ago
  Dylan Baker 64c0eaf641 Accidentally removed order clause 2 years ago
  Dylan Baker 4df2839f8e Add searching specs 2 years ago
  Dylan Baker 762c22f34f Specify language of search 2 years ago
  Dylan Baker a642c6bbeb Oops 2 years ago
  Dylan Baker db3bdae43f Add VCR to fetcher and scraper specs 2 years ago
  Dylan Baker ec8e45ba59 Merge branch 'specs' 2 years ago
  Dylan Baker 8bbc70cc5a Add exact match function 2 years ago
  Dylan Baker ed3e225aba Specs 2 years ago
  Dylan Baker 2c5c251e7b Rename scraper 2 years ago
  Dylan Baker db0d10b969 Refactor scraper 2 years ago
  Dylan Baker ef2fed5fc0 Allow passing DB into migration 2 years ago
  Dylan Baker c978ff6241 Refactor parser and add specs 2 years ago
  Dylan Baker 9198478704 Refactor fetcher, add specs 2 years ago
20 changed files with 419 additions and 152 deletions
  1. 1
    0
      .gitignore
  2. 2
    1
      Gemfile
  3. 14
    0
      Gemfile.lock
  4. 15
    1
      Rakefile
  5. 1
    0
      db/connect.rb
  6. 1
    2
      db/migrate.rb
  7. 43
    30
      db/scraper.rb
  8. 0
    24
      lib/fetch.rb
  9. 29
    0
      lib/fetcher.rb
  10. 2
    0
      lib/models/post.rb
  11. 2
    0
      lib/models/thread.rb
  12. 6
    6
      lib/parser.rb
  13. 43
    44
      lib/search.rb
  14. 36
    0
      spec/fetcher_spec.rb
  15. 118
    42
      spec/parser_spec.rb
  16. 30
    0
      spec/scraper_spec.rb
  17. 48
    0
      spec/search_spec.rb
  18. 12
    0
      spec/spec_helper.rb
  19. 1
    0
      web/server.rb
  20. 15
    2
      web/views/partials/search.erb

+ 1
- 0
.gitignore View File

@@ -3,3 +3,4 @@ vendor/
3 3
 web/public/*.css
4 4
 web/public/*.js
5 5
 db/log
6
+spec/vcr

+ 2
- 1
Gemfile View File

@@ -13,5 +13,6 @@ gem 'rspec', '~> 3.9'
13 13
 gem 'sassc', '~> 2.2'
14 14
 gem 'sequel', '~> 5.30'
15 15
 gem 'sinatra', '~> 2.0'
16
-
17 16
 gem "truncato", "~> 0.7.11"
17
+gem "vcr", "~> 6.0"
18
+gem "webmock", "~> 3.14"

+ 14
- 0
Gemfile.lock View File

@@ -1,9 +1,14 @@
1 1
 GEM
2 2
   remote: https://rubygems.org/
3 3
   specs:
4
+    addressable (2.8.0)
5
+      public_suffix (>= 2.0.2, < 5.0)
6
+    crack (0.4.5)
7
+      rexml
4 8
     diff-lcs (1.3)
5 9
     dotenv (2.7.5)
6 10
     ffi (1.12.2)
11
+    hashdiff (1.0.1)
7 12
     htmlentities (4.3.4)
8 13
     httparty (0.18.0)
9 14
       mime-types (~> 3.0)
@@ -18,10 +23,12 @@ GEM
18 23
     nokogiri (1.10.9)
19 24
       mini_portile2 (~> 2.4.0)
20 25
     pg (1.2.3)
26
+    public_suffix (4.0.6)
21 27
     rack (2.2.2)
22 28
     rack-protection (2.0.8.1)
23 29
       rack
24 30
     rake (13.0.1)
31
+    rexml (3.2.5)
25 32
     rspec (3.9.0)
26 33
       rspec-core (~> 3.9.0)
27 34
       rspec-expectations (~> 3.9.0)
@@ -48,6 +55,11 @@ GEM
48 55
     truncato (0.7.11)
49 56
       htmlentities (~> 4.3.1)
50 57
       nokogiri (>= 1.7.0, <= 2.0)
58
+    vcr (6.0.0)
59
+    webmock (3.14.0)
60
+      addressable (>= 2.8.0)
61
+      crack (>= 0.3.2)
62
+      hashdiff (>= 0.4.0, < 2.0.0)
51 63
 
52 64
 PLATFORMS
53 65
   ruby
@@ -64,6 +76,8 @@ DEPENDENCIES
64 76
   sequel (~> 5.30)
65 77
   sinatra (~> 2.0)
66 78
   truncato (~> 0.7.11)
79
+  vcr (~> 6.0)
80
+  webmock (~> 3.14)
67 81
 
68 82
 BUNDLED WITH
69 83
    2.1.4

+ 15
- 1
Rakefile View File

@@ -12,8 +12,22 @@ task 'migrate' do
12 12
   migrate
13 13
 end
14 14
 
15
+task 'migrate_test' do
16
+  require_relative './db/migrate'
17
+
18
+  TEST_DB = Sequel.connect(
19
+    adapter: :postgres,
20
+    database: ENV['DB_DATABASE'] + '_test',
21
+    user: ENV['DB_USERNAME'],
22
+    password: ENV['DB_PASSWORD'],
23
+    logger: ENV['APP_ENV'] == 'development' ? Logger.new('db/log') : nil
24
+  )
25
+
26
+  migrate(db: TEST_DB)
27
+end
28
+
15 29
 task 'scrape' do
16
-  require_relative './db/scrape'
30
+  require_relative './db/scraper'
17 31
 
18 32
   should_log = ENV['APP_ENV'] == 'development' || ARGV.include?('--log')
19 33
   scraper = Scraper.new(log: should_log)

+ 1
- 0
db/connect.rb View File

@@ -1,5 +1,6 @@
1 1
 require 'dotenv'
2 2
 require 'logger'
3
+require 'sequel'
3 4
 
4 5
 Dotenv.load(File.expand_path('../.env'))
5 6
 

+ 1
- 2
db/migrate.rb View File

@@ -3,8 +3,7 @@ require 'sequel'
3 3
 
4 4
 require_relative 'connect'
5 5
 
6
-def migrate
7
-  db = DB
6
+def migrate(db: DB)
8 7
   db.create_table? :threads do
9 8
     primary_key :id
10 9
     String :title

db/scrape.rb → db/scraper.rb View File

@@ -3,8 +3,8 @@ require 'sequel'
3 3
 
4 4
 require_relative '../db/connect'
5 5
 require_relative '../lib/auth'
6
-require_relative '../lib/fetch'
7
-require_relative '../lib/parse'
6
+require_relative '../lib/fetcher'
7
+require_relative '../lib/parser'
8 8
 require_relative '../lib/models/post'
9 9
 require_relative '../lib/models/thread'
10 10
 
@@ -13,21 +13,27 @@ class Scraper
13 13
     @first = first
14 14
     @last = last
15 15
     @log = log
16
+    @no_new_posts = false
16 17
 
17 18
     authenticate!
18 19
   end
19 20
 
20 21
   def scrape
21 22
     (@first..@last).each_with_index do |page_number, page_index|
22
-      page = Fetch.page(page_number, @cookie)
23
-      threads = Parse.threads(page)
23
+      page = fetcher.page(page_number)
24
+      threads = parser.threads(page)
24 25
 
25 26
       threads.each do |t|
26 27
         next if page_index > 0 && t[:is_sticky]
27
-        no_new_posts = scrape_thread(t)
28
+
29
+        scrape_thread(t)
30
+
28 31
         if no_new_posts
29
-          next if t[:is_sticky]
30
-          return
32
+          if t[:is_sticky]
33
+            next
34
+          else
35
+            return
36
+          end
31 37
         end
32 38
       end
33 39
     end
@@ -35,51 +41,58 @@ class Scraper
35 41
 
36 42
   private
37 43
 
38
-  def authenticate!
39
-    @cookie = login(ENV['VLV_USERNAME'], ENV['VLV_PASSWORD'])
40
-
41
-    raise "Error logging into VLV. Check your credentials." if @cookie.nil?
42
-  end
44
+  attr_reader :cookie, :fetcher, :parser, :no_new_posts
43 45
 
44 46
   def scrape_thread(t)
47
+    @no_new_posts = false
48
+
45 49
     log t[:title]
46 50
 
47
-    page = Fetch.thread(t, @cookie)
51
+    page = fetcher.thread(t)
48 52
     first_post = page.at_css('.postinfo:first-child')
49 53
     return false if first_post.nil?
50 54
 
51
-    t[:created_at] = Parse.thread_created_at(first_post)
52
-
53 55
     thread = DB.from(:threads).first(remote_id: t[:remote_id])
54 56
     if thread.nil?
55
-      thread = VLV::Thread.create(t.delete_if { |k| k == :is_sticky })
56 57
       log '  Inserting thread'
58
+      t[:created_at] = parser.thread_created_at(first_post)
59
+      thread = VLV::Thread.create(t.delete_if { |k| k == :is_sticky })
57 60
     end
58 61
 
59 62
     scrape_posts(thread, page)
60 63
   end
61 64
 
62 65
   def scrape_posts(thread, page)
63
-    posts = Parse.posts(thread, page)
66
+    posts = parser.posts(thread, page)
64 67
     last_post = posts.last
65
-    unless DB.from(:posts).first(remote_id: last_post[:remote_id]).nil?
66
-      log '  No new posts'
67
-      return true
68
-    end
69 68
 
70
-    posts_count = posts.size
71
-    posts.each_with_index do |p, index|
72
-      msg = "  Inserting post #{index + 1}/#{posts_count}"
73
-      print msg if @log
74
-      if DB.from(:posts).first(remote_id: p[:remote_id]).nil?
75
-        VLV::Post.create(p)
69
+    if DB.from(:posts).first(remote_id: last_post[:remote_id]).nil?
70
+      db_posts = VLV::Post.where(remote_id: posts.map { |p| p[:remote_id] }).all
71
+
72
+      posts = posts.each_with_index.map do |post|
73
+        if db_posts.detect { |db_post| db_post.remote_id == post[:remote_id] }.nil?
74
+          post
75
+        end
76 76
       end
77
-      print "\b" * msg.size unless index == posts_count - 1 if @log
77
+
78
+      log "  Inserting #{posts.size} posts"
79
+      VLV::Post.multi_insert(posts.compact)
80
+    else
81
+      no_new_posts!
82
+      log '  No new posts'
78 83
     end
84
+  end
79 85
 
80
-    log ''
86
+  def no_new_posts!
87
+    @no_new_posts = true
88
+  end
89
+
90
+  def authenticate!
91
+    @cookie = login(ENV['VLV_USERNAME'], ENV['VLV_PASSWORD'])
92
+    raise "Error logging into VLV. Check your credentials." if @cookie.nil?
81 93
 
82
-    false
94
+    @fetcher = Fetcher.new(cookie: cookie)
95
+    @parser = Parser.new
83 96
   end
84 97
 
85 98
   def log(msg)

+ 0
- 24
lib/fetch.rb View File

@@ -1,24 +0,0 @@
1
-require 'net/http'
2
-
3
-module Fetch
4
-  def self.page(page_number, cookie)
5
-    url = URI("http://board.vivalavinyl.com/thread/list/#{page_number}")
6
-    http = Net::HTTP.new(url.host, url.port)
7
-    request = Net::HTTP::Get.new(url)
8
-    request['cookie'] = cookie
9
-    response = http.request(request)
10
-    Nokogiri.HTML(response.body)
11
-  end
12
-
13
-  def self.thread(thread, cookie)
14
-    url =
15
-      URI(
16
-        "http://board.vivalavinyl.com/thread/view/#{thread[:remote_id]}&ajax=true"
17
-      )
18
-    http = Net::HTTP.new(url.host, url.port)
19
-    request = Net::HTTP::Get.new(url)
20
-    request['cookie'] = cookie
21
-    response = http.request(request)
22
-    Nokogiri.HTML(response.body)
23
-  end
24
-end

+ 29
- 0
lib/fetcher.rb View File

@@ -0,0 +1,29 @@
1
+require 'net/http'
2
+require 'nokogiri'
3
+
4
+class Fetcher
5
+  BASE_URL = 'http://board.vivalavinyl.com'
6
+
7
+  def initialize(cookie:)
8
+    @cookie = cookie
9
+  end
10
+
11
+  def page(page_number)
12
+    authenticated_request("/thread/list/#{page_number}")
13
+  end
14
+
15
+  def thread(thread)
16
+    authenticated_request("/thread/view/#{thread[:remote_id]}&ajax=true")
17
+  end
18
+
19
+  private
20
+
21
+  def authenticated_request(path)
22
+    uri = URI("#{BASE_URL}#{path}")
23
+    http = Net::HTTP.new(uri.host, uri.port)
24
+    request = Net::HTTP::Get.new(uri)
25
+    request['cookie'] = @cookie
26
+    response = http.request(request)
27
+    Nokogiri.HTML(response.body)
28
+  end
29
+end

+ 2
- 0
lib/models/post.rb View File

@@ -1,3 +1,5 @@
1
+require 'sequel'
2
+
1 3
 module VLV
2 4
   class Post < Sequel::Model
3 5
     many_to_one :thread

+ 2
- 0
lib/models/thread.rb View File

@@ -1,3 +1,5 @@
1
+require 'sequel'
2
+
1 3
 module VLV
2 4
   class Thread < Sequel::Model
3 5
     one_to_many :posts

lib/parse.rb → lib/parser.rb View File

@@ -1,7 +1,8 @@
1 1
 require 'nokogiri'
2
+require 'time'
2 3
 
3
-module Parse
4
-  def self.threads(page)
4
+class Parser
5
+  def threads(page)
5 6
     threads = Array.new
6 7
 
7 8
     page.css('.even, .odd').each do |row|
@@ -20,15 +21,14 @@ module Parse
20 21
     threads
21 22
   end
22 23
 
23
-  def self.posts(thread, page)
24
+  def posts(thread, page)
24 25
     posts = Array.new
25 26
 
26 27
     page.css('.post').each do |_post|
27 28
       post = Hash.new
28 29
       post[:remote_id] = _post.at_css('ul.view')[:id].split('_')[1].to_i
29 30
       post[:creator] = _post.at_css('.memberlink').text.strip
30
-      date, time =
31
-        _post.at_css('.postinfo').text.split('posted this')[1].split('@')
31
+      date, time = _post.at_css('.postinfo').text.split('posted this')[1].split('@')
32 32
       post[:created_at] = Time.parse("#{date} #{time}")
33 33
       post[:body] = _post.at_css('.postbody').children.map(&:to_html).join.strip
34 34
       post[:thread_id] = thread[:id]
@@ -38,7 +38,7 @@ module Parse
38 38
     posts
39 39
   end
40 40
 
41
-  def self.thread_created_at(first_post)
41
+  def thread_created_at(first_post)
42 42
     post_info = first_post.text.split('posted this')
43 43
     date, time = post_info[1].split('@')
44 44
     Time.parse("#{date} #{time}")

+ 43
- 44
lib/search.rb View File

@@ -1,5 +1,8 @@
1 1
 require 'sequel'
2 2
 
3
+require_relative './models/post'
4
+require_relative './models/thread'
5
+
3 6
 RESULTS_PER_PAGE = 50
4 7
 
5 8
 def search(params)
@@ -8,6 +11,8 @@ def search(params)
8 11
   username = params[:username].strip
9 12
   from_date = params[:from_date].strip
10 13
   to_date = params[:to_date].strip
14
+  sort = params[:sort].strip
15
+  exact_match = params[:exact_match].strip == "yes"
11 16
 
12 17
   errors = Array.new
13 18
 
@@ -39,18 +44,9 @@ def search(params)
39 44
 
40 45
   results = case params[:type]
41 46
   when 'threads'
42
-    sort =
43
-      case params[:sort]
44
-      when 'thread'
45
-        'created_at DESC'
46
-      when 'post'
47
-        'last_post_created_at DESC'
48
-      else
49
-        'created_at DESC'
50
-      end
51
-    search_threads(query, username, from_date, to_date, sort, offset)
47
+    search_threads(query, username, from_date, to_date, sort, offset, exact_match)
52 48
   when 'posts'
53
-    search_posts(query, username, from_date, to_date, offset)
49
+    search_posts(query, username, from_date, to_date, offset, exact_match)
54 50
   else
55 51
     Array.new
56 52
   end
@@ -58,39 +54,42 @@ def search(params)
58 54
   {results: results, errors: errors}
59 55
 end
60 56
 
61
-def search_threads(query, username, from_date, to_date, sort, offset)
62
-  DB[<<-SQL, query, username, username, from_date, from_date, to_date, to_date, offset]
63
-    SELECT
64
-      threads.*,
65
-      count(*) OVER() AS full_count
66
-    FROM threads
67
-    WHERE
68
-      to_tsvector(title) @@ plainto_tsquery(?)
69
-      AND (LOWER(threads.creator) = LOWER(?) OR ? = '')
70
-      AND (created_at >= ? OR ? IS NULL)
71
-      AND (created_at <= ? OR ? IS NULL)
72
-    ORDER BY #{sort}
73
-    LIMIT #{RESULTS_PER_PAGE}
74
-    OFFSET ?;
75
-  SQL
57
+def search_threads(q, username, from_date, to_date, sort, offset, exact_match)
58
+  sort = Sequel.desc(sort == 'post' ? :last_post_created_at : :created_at)
59
+  query = VLV::Thread
60
+    .select(Sequel.lit('threads.*, count(*) OVER() AS full_count'))
61
+    .where(Sequel.lit("(LOWER(threads.creator) = LOWER(?) OR ? = '')", username, username))
62
+    .where(Sequel.lit("created_at >= ? OR ? IS NULL", from_date, from_date))
63
+    .where(Sequel.lit("created_at >= ? OR ? IS NULL", to_date, to_date))
64
+    .order(sort)
65
+    .limit(RESULTS_PER_PAGE)
66
+    .offset(Sequel.lit('?', offset))
67
+
68
+  if exact_match
69
+    query.where(Sequel.ilike(:title, "%#{q}%"))
70
+  else
71
+    query.full_text_search(:title, Sequel.lit("websearch_to_tsquery(?)", q), tsquery: true, language: 'english')
72
+  end
76 73
 end
77 74
 
78
-def search_posts(query, username, from_date, to_date, offset)
79
-  DB[<<-SQL, query, username, username, from_date, from_date, to_date, to_date, offset]
80
-    SELECT
81
-      posts.*,
82
-      threads.title as thread_title,
83
-      threads.remote_id as remote_thread_id,
84
-      count(*) OVER() AS full_count
85
-    FROM posts
86
-    INNER JOIN threads on posts.thread_id = threads.id
87
-    WHERE
88
-      tsv @@ plainto_tsquery(?)
89
-      AND ((LOWER(posts.creator) = LOWER(?)) OR (? = ''))
90
-      AND (posts.created_at >= ? OR ? IS NULL)
91
-      AND (posts.created_at <= ? OR ? IS NULL)
92
-    ORDER BY posts.created_at DESC
93
-    LIMIT #{RESULTS_PER_PAGE}
94
-    OFFSET ?;
95
-  SQL
75
+def search_posts(q, username, from_date, to_date, offset, exact_match)
76
+  query = VLV::Post
77
+    .select(Sequel.lit('posts.*, threads.title as thread_title, threads.remote_id as remote_thread_id, count(*) OVER() AS full_count'))
78
+    .join(Sequel.lit('threads on posts.thread_id = threads.id'))
79
+    .where(Sequel.lit("(LOWER(posts.creator) = LOWER(?) OR (? = ''))", username, username))
80
+    .where(Sequel.lit("posts.created_at >= ? OR ? IS NULL", from_date, from_date))
81
+    .where(Sequel.lit("posts.created_at >= ? OR ? IS NULL", to_date, to_date))
82
+    .limit(RESULTS_PER_PAGE)
83
+    .offset(Sequel.lit('?', offset))
84
+    .order(Sequel.desc(Sequel.lit('posts.created_at')))
85
+
86
+  if exact_match
87
+    query.where(Sequel.ilike(:body, "%#{q}%"))
88
+  else
89
+    query.full_text_search(:tsv, Sequel.lit("websearch_to_tsquery(?)", q), {
90
+      tsquery: true,
91
+      tsvector: true,
92
+      language: 'english'
93
+    })
94
+  end
96 95
 end

+ 36
- 0
spec/fetcher_spec.rb View File

@@ -0,0 +1,36 @@
1
+require 'dotenv/load'
2
+require_relative '../lib/auth'
3
+require_relative '../lib/fetcher'
4
+
5
+RSpec.describe Fetcher do
6
+  let(:cookie) { login(ENV['VLV_USERNAME'], ENV['VLV_PASSWORD']) }
7
+  let(:thread_id) { 49668 }
8
+
9
+  subject { Fetcher.new(cookie: cookie) }
10
+
11
+  describe '#thread' do
12
+    it "fetches a thread" do
13
+      VCR.use_cassette "fetch_a_thread" do
14
+        result = subject.thread({ remote_id: 49668 })
15
+        aggregate_failures do
16
+          expect(result).to be_a(Nokogiri::HTML::Document)
17
+          expect(result.at_css('.post:first-child').text)
18
+            .to include('reddwarf posted this March 30th, 2020 @ 3:29:23 am')
19
+        end
20
+      end
21
+    end
22
+  end
23
+
24
+  describe '#page' do
25
+    it "fetches a page of threads" do
26
+      VCR.use_cassette "fetch_a_page_of_threads" do
27
+        result = subject.page(0)
28
+        aggregate_failures do
29
+          expect(result).to be_a(Nokogiri::HTML::Document)
30
+          expect(result.at_css('h3').text).to eq('VLV: politics, food, and aging')
31
+          expect(result.css('.even, .odd').length).to eq(110)
32
+        end
33
+      end
34
+    end
35
+  end
36
+end

+ 118
- 42
spec/parser_spec.rb View File

@@ -1,47 +1,123 @@
1
-require_relative '../lib/parse'
1
+require_relative '../lib/parser'
2 2
 
3
-RSpec.describe 'Parser' do
4
-  it 'should parse threads' do
5
-    html = Nokogiri::HTML(<<~HTML)
6
-      <div class="even" id="thread_12345">
7
-        <ul class="list read">
8
-          <li class="member">
9
-            <span>Thread By: </span>
10
-            <a href="/member/view/creator1/" class="memberlink">creator1</a>
11
-          </li>
12
-          <li class="subject">
13
-            <span>Subject: </span>
14
-            <a href="/thread/view/12345/&p=999">
15
-              <strong>Sticky:</sticky> Thread title 1
16
-            </a>
17
-          </li>
18
-          <li class="posts"><span>Posts: </span>999</li>
19
-          <li class="lastpost">
20
-            <span>Last Post By:</span>
21
-            <a href="/member/view/lastposter1/" class="memberlink">lastposter1</a> on Fri&nbsp;Apr&nbsp;10&nbsp;2020&nbsp;01:23&nbsp;am</li>
22
-        </ul>
23
-      </div>
24
-      <div class="even" id="thread_123456">
25
-        <ul class="list read">
26
-          <li class="member">
27
-            <span>Thread By: </span>
28
-            <a href="/member/view/creator2/" class="memberlink">creator2</a>
29
-          </li>
30
-          <li class="subject">
31
-            <span>Subject: </span>
32
-            <a href="/thread/view/123456/&p=999">Thread title 2</a>
33
-          </li>
34
-          <li class="posts"><span>Posts: </span>999</li>
35
-          <li class="lastpost">
36
-            <span>Last Post By:</span>
37
-            <a href="/member/view/lastposter2/" class="memberlink">lastposter2</a> on Fri&nbsp;Apr&nbsp;10&nbsp;2020&nbsp;01:23&nbsp;am</li>
38
-        </ul>
3
+RSpec.describe Parser do
4
+  subject { Parser.new }
5
+
6
+  describe '#threads' do
7
+    let(:html) do
8
+      Nokogiri::HTML(<<~HTML)
9
+        <div class="even" id="thread_12345">
10
+          <ul class="list read">
11
+            <li class="member">
12
+              <span>Thread By: </span>
13
+              <a href="/member/view/creator1/" class="memberlink">creator1</a>
14
+            </li>
15
+            <li class="subject">
16
+              <span>Subject: </span>
17
+              <a href="/thread/view/12345/&p=999">
18
+                <strong>Sticky:</sticky> Thread title 1
19
+              </a>
20
+            </li>
21
+            <li class="posts"><span>Posts: </span>999</li>
22
+            <li class="lastpost">
23
+              <span>Last Post By:</span>
24
+              <a href="/member/view/lastposter1/" class="memberlink">lastposter1</a> on Fri&nbsp;Apr&nbsp;10&nbsp;2020&nbsp;01:23&nbsp;am</li>
25
+          </ul>
26
+        </div>
27
+        <div class="even" id="thread_123456">
28
+          <ul class="list read">
29
+            <li class="member">
30
+              <span>Thread By: </span>
31
+              <a href="/member/view/creator2/" class="memberlink">creator2</a>
32
+            </li>
33
+            <li class="subject">
34
+              <span>Subject: </span>
35
+              <a href="/thread/view/123456/&p=999">Thread title 2</a>
36
+            </li>
37
+            <li class="posts"><span>Posts: </span>999</li>
38
+            <li class="lastpost">
39
+              <span>Last Post By:</span>
40
+              <a href="/member/view/lastposter2/" class="memberlink">lastposter2</a> on Fri&nbsp;Apr&nbsp;10&nbsp;2020&nbsp;01:23&nbsp;am</li>
41
+          </ul>
42
+        </div>
43
+      HTML
44
+    end
45
+
46
+    it 'parses threads' do
47
+      expect(subject.threads(html)).to eq([
48
+        {remote_id: '12345', title: 'Sticky: Thread title 1', creator: 'creator1', is_sticky: true},
49
+        {remote_id: '123456', title: 'Thread title 2', creator: 'creator2', is_sticky: false},
50
+      ])
51
+    end
52
+  end
53
+
54
+  describe '#posts' do
55
+    let(:html) do
56
+      Nokogiri::HTML(<<~HTML)
57
+        <div>
58
+          <div class="post">
59
+            <ul class="view" id="post_69">
60
+              <li class="info">
61
+                <div class="postinfo">
62
+                  <a class="memberlink" href="/member/view/User1">User1</a>
63
+                  posted this October 26th, 2021 @ 12:34:56 am
64
+                </div>
65
+              </li>
66
+              <li class="postbody">
67
+                This is the body of the first post
68
+              </li>
69
+            </ul>
70
+          </div>
71
+          <div class="post">
72
+            <ul class="view" id="post_420">
73
+              <li class="info">
74
+                <div class="postinfo">
75
+                  <a class="memberlink" href="/member/view/User2">User2</a>
76
+                  posted this October 27th, 2021 @ 12:34:56 am
77
+                </div>
78
+              </li>
79
+              <li class="postbody">
80
+                This is the body of the second post
81
+              </li>
82
+            </ul>
83
+          </div>
84
+        </div>
85
+      HTML
86
+    end
87
+
88
+    it 'parses posts' do
89
+      expect(subject.posts({ id: 666 }, html)).to match_array([
90
+        {
91
+          remote_id: 69,
92
+          creator: 'User1',
93
+          thread_id: 666,
94
+          created_at: Time.new(2021, 10, 26, 0, 34, 56),
95
+          body: 'This is the body of the first post'
96
+        },
97
+        {
98
+          remote_id: 420,
99
+          creator: 'User2',
100
+          thread_id: 666,
101
+          created_at: Time.new(2021, 10, 27, 0, 34, 56),
102
+          body: 'This is the body of the second post'
103
+        }
104
+      ])
105
+    end
106
+  end
107
+
108
+  describe '#thread_created_at' do
109
+    let(:html) do
110
+      Nokogiri::HTML(<<~HTML)
111
+      <div class="postinfo">
112
+        <a class="memberlink" href="/member/view/User1">User1</a>
113
+        posted this October 27th, 2021 @ 12:34:56 am
39 114
       </div>
40
-    HTML
115
+      HTML
116
+    end
41 117
 
42
-    expect(Parse.threads(html)).to eq([
43
-      {remote_id: '12345', title: 'Sticky: Thread title 1', creator: 'creator1', is_sticky: true},
44
-      {remote_id: '123456', title: 'Thread title 2', creator: 'creator2', is_sticky: false},
45
-    ])
118
+    it 'parses the timestamp of the first post' do
119
+      expect(subject.thread_created_at(html))
120
+        .to eq(Time.new(2021, 10, 27, 00, 34, 56))
121
+    end
46 122
   end
47 123
 end

+ 30
- 0
spec/scraper_spec.rb View File

@@ -0,0 +1,30 @@
1
+require 'dotenv/load'
2
+require 'spec_helper'
3
+
4
+require_relative '../db/scraper'
5
+
6
+RSpec.describe Scraper do
7
+  around(:each) do |example|
8
+    DB.transaction(rollback: :always, auto_savepoint: true) { example.run }
9
+  end
10
+
11
+  subject { described_class.new(log: true) }
12
+
13
+  describe '#initialize' do
14
+    it 'authenticates' do
15
+      VCR.use_cassette "authentication" do
16
+        expect(subject.send(:cookie)).to_not be_nil
17
+      end
18
+    end
19
+  end
20
+
21
+  describe '#scrape' do
22
+    it 'creates new threads and posts' do
23
+      VCR.use_cassette "create_new_threads_and_posts" do
24
+        expect { subject.scrape }
25
+          .to  change { VLV::Thread.count }.by(109)
26
+          .and change { VLV::Post.count   }.by_at_least(5000)
27
+      end
28
+    end
29
+  end
30
+end

+ 48
- 0
spec/search_spec.rb View File

@@ -0,0 +1,48 @@
1
+require 'dotenv/load'
2
+require 'spec_helper'
3
+
4
+require_relative '../db/connect'
5
+require_relative '../lib/search'
6
+require_relative '../lib/models/post'
7
+require_relative '../lib/models/thread'
8
+
9
+RSpec.describe 'Search' do
10
+  around(:each) do |example|
11
+    DB.transaction(rollback: :always, auto_savepoint: true) { example.run }
12
+  end
13
+
14
+  before do
15
+    @thread1 = VLV::Thread.create(title: "This is a thread with many words in the title")
16
+    @thread2 = VLV::Thread.create(title: "Thread words")
17
+    @post1   = VLV::Post.create(body: "This is a post with many words in the body", thread_id: @thread1.id)
18
+    @post2   = VLV::Post.create(body: "Post words", thread_id: @thread2.id)
19
+  end
20
+
21
+  describe '#search_threads' do
22
+    it "finds threads with matching words" do
23
+      threads = search_threads('thread words', '', nil, nil, nil, nil, false)
24
+      expect(threads.map(&:title)).to match_array([@thread1.title, @thread2.title])
25
+    end
26
+
27
+    context "with exact_match" do
28
+      it "only finds threads with exactly matching phrases" do
29
+        threads = search_threads('thread words', '', nil, nil, nil, nil, true)
30
+        expect(threads.map(&:title)).to match_array([@thread2.title])
31
+      end
32
+    end
33
+  end
34
+
35
+  describe '#search_posts' do
36
+    it "finds posts with matching words" do
37
+      posts = search_posts('post words', '', nil, nil, nil, false)
38
+      expect(posts.map(&:body)).to match_array([@post1.body, @post2.body])
39
+    end
40
+
41
+    context "with exact_match" do
42
+      it "only finds posts with exactly matching phrases" do
43
+        posts = search_posts('post words', '', nil, nil, nil, true)
44
+        expect(posts.map(&:body)).to match_array([@post2.body])
45
+      end
46
+    end
47
+  end
48
+end

+ 12
- 0
spec/spec_helper.rb View File

@@ -0,0 +1,12 @@
1
+require 'vcr'
2
+require "webmock/rspec"
3
+
4
+ENV["DB_DATABASE"] = ENV["DB_DATABASE"] + '_test'
5
+
6
+WebMock.disable_net_connect!(allow_localhost: true)
7
+
8
+VCR.configure do |c|
9
+  c.cassette_library_dir = "spec/vcr"
10
+  c.hook_into :webmock
11
+  c.ignore_localhost = true
12
+end

+ 1
- 0
web/server.rb View File

@@ -37,6 +37,7 @@ class VLVSearch < Sinatra::Base
37 37
     params[:username] = String.new unless params[:username]
38 38
     params[:from_date] = String.new unless params[:from_date]
39 39
     params[:to_date] = String.new unless params[:to_date]
40
+    params[:exact_match] = String.new unless params[:exact_match]
40 41
 
41 42
     results = search(params)
42 43
 

+ 15
- 2
web/views/partials/search.erb View File

@@ -2,10 +2,21 @@
2 2
   <input name="q" type="search" value="<%= params[:q] %>" placeholder="Search for..." class="form__search" required>
3 3
   <div class="filters">
4 4
     <div class="filters__section">
5
+      <div class="filters__subsection">
6
+        <p>Exact match?</p>
7
+        <label class="form__label">
8
+          <input type="radio" name="exact_match" value="yes" <% if [nil,true, "yes"].include?(params[:exact_match]) %>checked<% end %>>
9
+          Yes
10
+        </label>
11
+        <label class="form__label">
12
+          <input type="radio" name="exact_match" value="no" <% if [false, "no"].include?(params[:exact_match]) %>checked<% end %>>
13
+          No
14
+        </label>
15
+      </div>
5 16
       <div class="filters__subsection">
6 17
         <p>Search in:</p>
7 18
         <label class="form__label">
8
-          <input type="radio" name="type" value="threads" <% if [nil, "threads"].include?(params[:type]) %>checked<% end%>>
19
+          <input type="radio" name="type" value="threads" <% if [nil, "threads"].include?(params[:type]) %>checked<% end %>>
9 20
           Threads
10 21
         </label>
11 22
         <label class="form__label">
@@ -13,6 +24,8 @@
13 24
           Posts
14 25
         </label>
15 26
       </div>
27
+    </div>
28
+    <div class="filters__section">
16 29
       <div class="filters__subsection">
17 30
         <p>From Date <em>(YYYY-MM-DD)</em>:</p>
18 31
         <label class="form__label">
@@ -48,7 +61,7 @@
48 61
       <div class="filters__subsection filters__subsection--sort <%= params[:type] == 'threads' ? 'open' : '' %>">
49 62
         <p>Sort by:</p>
50 63
         <label class="form__label">
51
-          <input type="radio" name="sort" value="thread" <% if [nil, "thread"].include? params[:sort] %>checked<% end%>>
64
+          <input type="radio" name="sort" value="thread" <% if [nil, "thread"].include? params[:sort] %>checked<% end %>>
52 65
           Thread creation
53 66
         </label>
54 67
         <label class="form__label">

Loading…
Cancel
Save