Browse Source

Refactor parser and add specs

master
Dylan Baker 2 years ago
parent
commit
c978ff6241
3 changed files with 128 additions and 53 deletions
  1. db/scrape.rb (+4, -5)
  2. lib/parser.rb (+6, -6)
  3. spec/parser_spec.rb (+118, -42)

+ 4
- 5
db/scrape.rb View File

@@ -3,8 +3,8 @@ require 'sequel'
3 3
 
4 4
 require_relative '../db/connect'
5 5
 require_relative '../lib/auth'
6
-require_relative '../lib/parse'
7 6
 require_relative '../lib/fetcher'
7
+require_relative '../lib/parser'
8 8
 require_relative '../lib/models/post'
9 9
 require_relative '../lib/models/thread'
10 10
 
@@ -20,7 +20,7 @@ class Scraper
20 20
   def scrape
21 21
     (@first..@last).each_with_index do |page_number, page_index|
22 22
       page = Fetcher.new(cookie: @cookie).page(page_number)
23
-      threads = Parse.threads(page)
23
+      threads = Parser.new.threads(page)
24 24
 
25 25
       threads.each do |t|
26 26
         next if page_index > 0 && t[:is_sticky]
@@ -48,10 +48,9 @@ class Scraper
48 48
     first_post = page.at_css('.postinfo:first-child')
49 49
     return false if first_post.nil?
50 50
 
51
-    t[:created_at] = Parse.thread_created_at(first_post)
52
-
53 51
     thread = DB.from(:threads).first(remote_id: t[:remote_id])
54 52
     if thread.nil?
53
+      t[:created_at] = Parser.new.thread_created_at(first_post)
55 54
       thread = VLV::Thread.create(t.delete_if { |k| k == :is_sticky })
56 55
       log '  Inserting thread'
57 56
     end
@@ -60,7 +59,7 @@ class Scraper
60 59
   end
61 60
 
62 61
   def scrape_posts(thread, page)
63
-    posts = Parse.posts(thread, page)
62
+    posts = Parser.new.posts(thread, page)
64 63
     last_post = posts.last
65 64
     unless DB.from(:posts).first(remote_id: last_post[:remote_id]).nil?
66 65
       log '  No new posts'

lib/parse.rb → lib/parser.rb View File

@@ -1,7 +1,8 @@
1 1
 require 'nokogiri'
2
+require 'time'
2 3
 
3
-module Parse
4
-  def self.threads(page)
4
+class Parser
5
+  def threads(page)
5 6
     threads = Array.new
6 7
 
7 8
     page.css('.even, .odd').each do |row|
@@ -20,15 +21,14 @@ module Parse
20 21
     threads
21 22
   end
22 23
 
23
-  def self.posts(thread, page)
24
+  def posts(thread, page)
24 25
     posts = Array.new
25 26
 
26 27
     page.css('.post').each do |_post|
27 28
       post = Hash.new
28 29
       post[:remote_id] = _post.at_css('ul.view')[:id].split('_')[1].to_i
29 30
       post[:creator] = _post.at_css('.memberlink').text.strip
30
-      date, time =
31
-        _post.at_css('.postinfo').text.split('posted this')[1].split('@')
31
+      date, time = _post.at_css('.postinfo').text.split('posted this')[1].split('@')
32 32
       post[:created_at] = Time.parse("#{date} #{time}")
33 33
       post[:body] = _post.at_css('.postbody').children.map(&:to_html).join.strip
34 34
       post[:thread_id] = thread[:id]
@@ -38,7 +38,7 @@ module Parse
38 38
     posts
39 39
   end
40 40
 
41
-  def self.thread_created_at(first_post)
41
+  def thread_created_at(first_post)
42 42
     post_info = first_post.text.split('posted this')
43 43
     date, time = post_info[1].split('@')
44 44
     Time.parse("#{date} #{time}")

+ 118
- 42
spec/parser_spec.rb View File

@@ -1,47 +1,123 @@
1
-require_relative '../lib/parse'
1
+require_relative '../lib/parser'
2 2
 
3
-RSpec.describe 'Parser' do
4
-  it 'should parse threads' do
5
-    html = Nokogiri::HTML(<<~HTML)
6
-      <div class="even" id="thread_12345">
7
-        <ul class="list read">
8
-          <li class="member">
9
-            <span>Thread By: </span>
10
-            <a href="/member/view/creator1/" class="memberlink">creator1</a>
11
-          </li>
12
-          <li class="subject">
13
-            <span>Subject: </span>
14
-            <a href="/thread/view/12345/&p=999">
15
-              <strong>Sticky:</sticky> Thread title 1
16
-            </a>
17
-          </li>
18
-          <li class="posts"><span>Posts: </span>999</li>
19
-          <li class="lastpost">
20
-            <span>Last Post By:</span>
21
-            <a href="/member/view/lastposter1/" class="memberlink">lastposter1</a> on Fri&nbsp;Apr&nbsp;10&nbsp;2020&nbsp;01:23&nbsp;am</li>
22
-        </ul>
23
-      </div>
24
-      <div class="even" id="thread_123456">
25
-        <ul class="list read">
26
-          <li class="member">
27
-            <span>Thread By: </span>
28
-            <a href="/member/view/creator2/" class="memberlink">creator2</a>
29
-          </li>
30
-          <li class="subject">
31
-            <span>Subject: </span>
32
-            <a href="/thread/view/123456/&p=999">Thread title 2</a>
33
-          </li>
34
-          <li class="posts"><span>Posts: </span>999</li>
35
-          <li class="lastpost">
36
-            <span>Last Post By:</span>
37
-            <a href="/member/view/lastposter2/" class="memberlink">lastposter2</a> on Fri&nbsp;Apr&nbsp;10&nbsp;2020&nbsp;01:23&nbsp;am</li>
38
-        </ul>
3
+RSpec.describe Parser do
4
+  subject { Parser.new }
5
+
6
+  describe '#threads' do
7
+    let(:html) do
8
+      Nokogiri::HTML(<<~HTML)
9
+        <div class="even" id="thread_12345">
10
+          <ul class="list read">
11
+            <li class="member">
12
+              <span>Thread By: </span>
13
+              <a href="/member/view/creator1/" class="memberlink">creator1</a>
14
+            </li>
15
+            <li class="subject">
16
+              <span>Subject: </span>
17
+              <a href="/thread/view/12345/&p=999">
18
+                <strong>Sticky:</sticky> Thread title 1
19
+              </a>
20
+            </li>
21
+            <li class="posts"><span>Posts: </span>999</li>
22
+            <li class="lastpost">
23
+              <span>Last Post By:</span>
24
+              <a href="/member/view/lastposter1/" class="memberlink">lastposter1</a> on Fri&nbsp;Apr&nbsp;10&nbsp;2020&nbsp;01:23&nbsp;am</li>
25
+          </ul>
26
+        </div>
27
+        <div class="even" id="thread_123456">
28
+          <ul class="list read">
29
+            <li class="member">
30
+              <span>Thread By: </span>
31
+              <a href="/member/view/creator2/" class="memberlink">creator2</a>
32
+            </li>
33
+            <li class="subject">
34
+              <span>Subject: </span>
35
+              <a href="/thread/view/123456/&p=999">Thread title 2</a>
36
+            </li>
37
+            <li class="posts"><span>Posts: </span>999</li>
38
+            <li class="lastpost">
39
+              <span>Last Post By:</span>
40
+              <a href="/member/view/lastposter2/" class="memberlink">lastposter2</a> on Fri&nbsp;Apr&nbsp;10&nbsp;2020&nbsp;01:23&nbsp;am</li>
41
+          </ul>
42
+        </div>
43
+      HTML
44
+    end
45
+
46
+    it 'parses threads' do
47
+      expect(subject.threads(html)).to eq([
48
+        {remote_id: '12345', title: 'Sticky: Thread title 1', creator: 'creator1', is_sticky: true},
49
+        {remote_id: '123456', title: 'Thread title 2', creator: 'creator2', is_sticky: false},
50
+      ])
51
+    end
52
+  end
53
+
54
+  describe '#posts' do
55
+    let(:html) do
56
+      Nokogiri::HTML(<<~HTML)
57
+        <div>
58
+          <div class="post">
59
+            <ul class="view" id="post_69">
60
+              <li class="info">
61
+                <div class="postinfo">
62
+                  <a class="memberlink" href="/member/view/User1">User1</a>
63
+                  posted this October 26th, 2021 @ 12:34:56 am
64
+                </div>
65
+              </li>
66
+              <li class="postbody">
67
+                This is the body of the first post
68
+              </li>
69
+            </ul>
70
+          </div>
71
+          <div class="post">
72
+            <ul class="view" id="post_420">
73
+              <li class="info">
74
+                <div class="postinfo">
75
+                  <a class="memberlink" href="/member/view/User2">User2</a>
76
+                  posted this October 27th, 2021 @ 12:34:56 am
77
+                </div>
78
+              </li>
79
+              <li class="postbody">
80
+                This is the body of the second post
81
+              </li>
82
+            </ul>
83
+          </div>
84
+        </div>
85
+      HTML
86
+    end
87
+
88
+    it 'parses posts' do
89
+      expect(subject.posts({ id: 666 }, html)).to match_array([
90
+        {
91
+          remote_id: 69,
92
+          creator: 'User1',
93
+          thread_id: 666,
94
+          created_at: Time.new(2021, 10, 26, 0, 34, 56),
95
+          body: 'This is the body of the first post'
96
+        },
97
+        {
98
+          remote_id: 420,
99
+          creator: 'User2',
100
+          thread_id: 666,
101
+          created_at: Time.new(2021, 10, 27, 0, 34, 56),
102
+          body: 'This is the body of the second post'
103
+        }
104
+      ])
105
+    end
106
+  end
107
+
108
+  describe '#thread_created_at' do
109
+    let(:html) do
110
+      Nokogiri::HTML(<<~HTML)
111
+      <div class="postinfo">
112
+        <a class="memberlink" href="/member/view/User1">User1</a>
113
+        posted this October 27th, 2021 @ 12:34:56 am
39 114
       </div>
40
-    HTML
115
+      HTML
116
+    end
41 117
 
42
-    expect(Parse.threads(html)).to eq([
43
-      {remote_id: '12345', title: 'Sticky: Thread title 1', creator: 'creator1', is_sticky: true},
44
-      {remote_id: '123456', title: 'Thread title 2', creator: 'creator2', is_sticky: false},
45
-    ])
118
+    it 'parses the timestamp of the first post' do
119
+      expect(subject.thread_created_at(html))
120
+        .to eq(Time.new(2021, 10, 27, 00, 34, 56))
121
+    end
46 122
   end
47 123
 end

Loading…
Cancel
Save