You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

scraper.rb 2.2KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101
  1. require 'dotenv/load'
  2. require 'sequel'
  3. require_relative '../db/connect'
  4. require_relative '../lib/auth'
  5. require_relative '../lib/fetcher'
  6. require_relative '../lib/parser'
  7. require_relative '../lib/models/post'
  8. require_relative '../lib/models/thread'
  9. class Scraper
  10. def initialize(first: 0, last: 0, log: false)
  11. @first = first
  12. @last = last
  13. @log = log
  14. @no_new_posts = false
  15. authenticate!
  16. end
  17. def scrape
  18. (@first..@last).each_with_index do |page_number, page_index|
  19. page = fetcher.page(page_number)
  20. threads = parser.threads(page)
  21. threads.each do |t|
  22. next if page_index > 0 && t[:is_sticky]
  23. scrape_thread(t)
  24. if no_new_posts
  25. if t[:is_sticky]
  26. next
  27. else
  28. return
  29. end
  30. end
  31. end
  32. end
  33. end
  34. private
  35. attr_reader :cookie, :fetcher, :parser, :no_new_posts
  36. def scrape_thread(t)
  37. @no_new_posts = false
  38. log t[:title]
  39. page = fetcher.thread(t)
  40. first_post = page.at_css('.postinfo:first-child')
  41. return false if first_post.nil?
  42. thread = DB.from(:threads).first(remote_id: t[:remote_id])
  43. if thread.nil?
  44. log ' Inserting thread'
  45. t[:created_at] = parser.thread_created_at(first_post)
  46. thread = VLV::Thread.create(t.delete_if { |k| k == :is_sticky })
  47. end
  48. scrape_posts(thread, page)
  49. end
  50. def scrape_posts(thread, page)
  51. posts = parser.posts(thread, page)
  52. last_post = posts.last
  53. if DB.from(:posts).first(remote_id: last_post[:remote_id]).nil?
  54. db_posts = VLV::Post.where(remote_id: posts.map { |p| p[:remote_id] }).all
  55. posts = posts.each_with_index.map do |post|
  56. if db_posts.detect { |db_post| db_post.remote_id == post[:remote_id] }.nil?
  57. post
  58. end
  59. end
  60. log " Inserting #{posts.size} posts"
  61. VLV::Post.multi_insert(posts.compact)
  62. else
  63. no_new_posts!
  64. log ' No new posts'
  65. end
  66. end
  67. def no_new_posts!
  68. @no_new_posts = true
  69. end
  70. def authenticate!
  71. @cookie = login(ENV['VLV_USERNAME'], ENV['VLV_PASSWORD'])
  72. raise "Error logging into VLV. Check your credentials." if @cookie.nil?
  73. @fetcher = Fetcher.new(cookie: cookie)
  74. @parser = Parser.new
  75. end
  76. def log(msg)
  77. puts msg if @log
  78. end
  79. end