You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

scrape.rb 1.8KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879
  1. require 'dotenv/load'
  2. require 'sequel'
  3. require_relative '../db/connect'
  4. require_relative '../lib/auth'
  5. require_relative '../lib/fetch'
  6. require_relative '../lib/insert'
  7. require_relative '../lib/parse'
  # Walks a range of listing pages, and stores the threads and posts found on
  # them that are not in the database yet.
  #
  # Depends on names loaded by this file's requires: +login+, +connect+, and
  # the Fetch, Parse and Insert modules.
  class Scraper
    # Logs in and opens the database connection.
    #
    # @param first [Integer] first listing page number to fetch
    # @param last [Integer] last listing page number to fetch (inclusive)
    # @param log [Boolean] when truthy, progress is written to stdout
    def initialize(first: 0, last: 0, log: false)
      @first = first
      @last = last
      @log = log
      # Credentials are read from the environment; the value returned by
      # +login+ is passed along with every fetch.
      @cookie = login(ENV['VLV_USERNAME'], ENV['VLV_PASSWORD'])
      # NOTE(review): presumably a Sequel database handle (the file requires
      # 'sequel'); only #from(...).first(...) is used on it in this class.
      @db = connect
    end

    # Scrapes every thread on pages @first..@last, in order.
    #
    # Sticky threads are only processed on the first page of the range. The
    # whole run stops at the first non-sticky thread that has no new posts.
    #
    # @return [Range, nil] the page range when every page was walked, nil when
    #   the run stopped early
    def scrape
      (@first..@last).each_with_index do |page_number, page_index|
        listing = Fetch.page(page_number, @cookie)
        Parse.threads(listing).each do |t|
          next if page_index.positive? && t[:is_sticky]
          # A falsy result means the thread had new posts (or nothing usable):
          # keep going with the next thread.
          next unless scrape_thread(t)

          # No new posts: a sticky thread says nothing about the rest of the
          # listing, anything else ends the run.
          return unless t[:is_sticky]
        end
      end
    end

    # Fetches one thread, inserts its row if it is not stored yet, then
    # scrapes its posts.
    #
    # @param t [Hash] parsed thread; reads :title and :remote_id, and gets
    #   :created_at assigned
    # @return [Boolean] true when the thread has no new posts; false when
    #   posts were inserted or the page has no '.postinfo:first-child' element
    def scrape_thread(t)
      log t[:title]
      page = Fetch.thread(t, @cookie)
      first_post = page.at_css('.postinfo:first-child')
      return false if first_post.nil?

      t[:created_at] = Parse.thread_created_at(first_post)
      thread = @db.from(:threads).first(remote_id: t[:remote_id])
      if thread.nil?
        thread = Insert.thread(t, @db)
        log ' Inserting thread'
      end
      scrape_posts(thread, page)
    end

    # Inserts the posts of a thread page that are not stored yet.
    #
    # @param thread [Object] stored thread record, handed to Parse.posts
    # @param page [Object] fetched thread page, handed to Parse.posts
    # @return [Boolean] true when the last post on the page is already
    #   stored (nothing new); false otherwise
    def scrape_posts(thread, page)
      posts = Parse.posts(thread, page)
      # With no parsed posts there is no last post to look up; report "not
      # caught up", matching scrape_thread's answer for a page without a
      # first post, instead of failing on nil.
      return false if posts.empty?

      if post_stored?(posts.last)
        log ' No new posts'
        return true
      end

      total = posts.size
      posts.each_with_index do |post, index|
        progress = " Inserting post #{index + 1}/#{total}"
        print progress if @log
        Insert.post(post, @db) unless post_stored?(post)
        # Backspace over the counter so the next one overwrites it; the final
        # counter stays on screen.
        print "\b" * progress.size if @log && index != total - 1
      end
      log ''
      false
    end

    # Writes +msg+ to stdout when logging is enabled.
    #
    # @param msg [String] line to print
    def log(msg)
      puts msg if @log
    end

    private

    # True when a row with the post's :remote_id exists in the posts table.
    def post_stored?(post)
      !@db.from(:posts).first(remote_id: post[:remote_id]).nil?
    end
  end