4 Commits

Author SHA1 Message Date
  Dylan Baker e46a9fa449 Parse stickiness in parse_threads 4 years ago
  Dylan Baker 2c524af608 Log in development 4 years ago
  Dylan Baker e1bbe85032 Add option to log or not 4 years ago
  Dylan Baker 210c1c5667 Fix breaking/sticky logic 4 years ago
2 changed files with 15 additions and 10 deletions
  1. Rakefile (+1 -1)
  2. db/scrape.rb (+14 -9)

+ 1
- 1
Rakefile View File

@@ -9,7 +9,7 @@ task 'migrate' do
9 9
 end
10 10
 
11 11
 task 'scrape' do
12
-  scrape
12
+  scrape(log: ENV['APP_ENV'] == 'development')
13 13
 end
14 14
 
15 15
 task 'build' do

+ 14
- 9
db/scrape.rb View File

@@ -16,6 +16,7 @@ def fetch_page(page_number, cookie)
16 16
 end
17 17
 
18 18
 def fetch_thread(thread, cookie)
19
+  sleep(0.5)
19 20
   url =
20 21
     URI(
21 22
       "http://board.vivalavinyl.com/thread/view/#{thread[:remote_id]}&ajax=true"
@@ -65,25 +66,28 @@ def parse_threads(page)
65 66
     thread_link = row.at_css('.subject > a')
66 67
     next if thread_link.nil?
67 68
     thread[:remote_id] = thread_link['href'].split('/')[3]
68
-    thread[:title] = thread_link.text
69
+    thread[:title] = thread_link.text.strip
69 70
     creator = row.at_css('.memberlink').text.strip
70 71
     creator = creator[0..-2] if creator.match(/\+$/)
71 72
     thread[:creator] = creator
73
+    thread[:is_sticky] = !!thread[:title].match(/^Sticky:/)
72 74
     threads << thread
73 75
   end
74 76
   threads
75 77
 end
76 78
 
77
-def scrape(first: 0, last: 0)
79
+def scrape(first: 0, last: 0, log: false)
78 80
   cookie = login(ENV['VLV_USERNAME'], ENV['VLV_PASSWORD'])
79 81
   db = connect
80 82
 
81
-  (first..last).each do |page_number|
83
+  (first..last).each_with_index do |page_number, index|
82 84
     page = fetch_page(page_number, cookie)
83 85
     threads = parse_threads(page)
84 86
     threads.each do |t|
85
-      puts t[:title]
86
-      sleep(1)
87
+      next if index > 0 && t[:is_sticky]
88
+
89
+      puts t[:title] if log
90
+
87 91
       page = fetch_thread(t, cookie)
88 92
       first_post = page.at_css('.postinfo:first-child')
89 93
 
@@ -96,7 +100,7 @@ def scrape(first: 0, last: 0)
96 100
       thread = db.from(:threads).first(remote_id: t[:remote_id])
97 101
       is_new_thread = thread.nil?
98 102
       if is_new_thread
99
-        puts '  Inserting thread'
103
+        puts '  Inserting thread' if log
100 104
         id =
101 105
           db.from(:threads).insert(
102 106
             title: t[:title],
@@ -112,13 +116,14 @@ def scrape(first: 0, last: 0)
112 116
 
113 117
       last_post = posts.last
114 118
       unless db.from(:posts).first(remote_id: last_post[:remote_id]).nil?
119
+        next if t[:is_sticky]
115 120
         break
116 121
       end
117 122
 
118 123
       posts_count = posts.size
119 124
       posts.each_with_index do |p, index|
120 125
         msg = "  Inserting post #{index + 1}/#{posts_count}"
121
-        print msg
126
+        print msg if log
122 127
         if is_new_thread || db.from(:posts).first(remote_id: p[:remote_id]).nil?
123 128
           db.from(:posts).insert(
124 129
             body: p[:body],
@@ -128,10 +133,10 @@ def scrape(first: 0, last: 0)
128 133
             remote_id: p[:remote_id]
129 134
           )
130 135
         end
131
-        print "\b" * msg.size unless index == posts_count - 1
136
+        print "\b" * msg.size unless index == posts_count - 1 if log
132 137
       end
133 138
 
134
-      puts
139
+      puts if log
135 140
     end
136 141
   end
137 142
 end

Loading…
Cancel
Save