require 'dotenv/load'
require 'sequel'
require_relative '../db/connect'
require_relative '../lib/auth'
require_relative '../lib/fetcher'
require_relative '../lib/parser'
require_relative '../lib/models/post'
require_relative '../lib/models/thread'

# Walks a range of listing pages, fetches every thread listed on them and
# stores the threads and posts that are not in the database yet.
class Scraper
  # Stores the options and logs in straight away (see #authenticate!).
  #
  # @param first [Integer] number of the first listing page to scrape
  # @param last [Integer] number of the last listing page to scrape (inclusive)
  # @param log [Boolean] when true, progress messages are printed with `puts`
  # @raise [RuntimeError] if logging in does not yield a cookie
  def initialize(first: 0, last: 0, log: false)
    @first = first
    @last = last
    @log = log
    @no_new_posts = false
    authenticate!
  end

  # Scrapes every thread on listing pages `first..last`.
  #
  # Sticky threads are only scraped on the first page of the run (presumably
  # because they are repeated on every listing page -- confirm). The whole run
  # stops at the first non-sticky thread that has no new posts (presumably the
  # listing is ordered by latest activity, so everything after it is already
  # stored -- confirm).
  #
  # @return [Range, nil] the page range when every page was visited, nil when
  #   the run stopped early
  def scrape
    (@first..@last).each_with_index do |page_number, page_index|
      listing = fetcher.page(page_number)

      parser.threads(listing).each do |t|
        # Read the flag before scraping so the stop decision below never
        # depends on what happens to `t` while the thread is processed.
        sticky = t[:is_sticky]
        next if sticky && page_index.positive?

        scrape_thread(t)
        return if no_new_posts && !sticky
      end
    end
  end

  private

  attr_reader :cookie, :fetcher, :parser, :no_new_posts

  # Fetches one thread, makes sure it has a row in `threads`, then stores its
  # new posts. Resets the "no new posts" flag first, so after the call the
  # flag describes this thread only.
  #
  # @param t [Hash] thread attributes as returned by `parser.threads`
  # @return [false, Object] false when the page has no `.postinfo:first-child`
  #   element, otherwise whatever #scrape_posts returns
  def scrape_thread(t)
    @no_new_posts = false
    log t[:title]

    page = fetcher.thread(t)
    first_post = page.at_css('.postinfo:first-child')
    return false if first_post.nil?

    thread = DB.from(:threads).first(remote_id: t[:remote_id]) || insert_thread(t, first_post)
    scrape_posts(thread, page)
  end

  # Creates the thread record for `t`, taking its creation time from the
  # thread's first post.
  #
  # `:is_sticky` is left out of the attributes passed to the model, but `t`
  # itself keeps the key: the caller still needs it after this returns.
  def insert_thread(t, first_post)
    log ' Inserting thread'
    t[:created_at] = parser.thread_created_at(first_post)
    VLV::Thread.create(t.reject { |key, _value| key == :is_sticky })
  end

  # Stores the posts of `page` that are not in the database yet.
  #
  # If the last parsed post is already stored, the thread is considered up to
  # date: the "no new posts" flag is set and nothing is inserted.
  #
  # @param thread [Object] thread row/record handed to `parser.posts`
  # @param page [Object] fetched thread page
  def scrape_posts(thread, page)
    posts = parser.posts(thread, page)
    # Nothing was parsed: there is nothing to store, and no evidence that the
    # thread is up to date either, so the flag is left untouched.
    return if posts.nil? || posts.empty?

    if DB.from(:posts).first(remote_id: posts.last[:remote_id])
      no_new_posts!
      log ' No new posts'
      return
    end

    known_ids = VLV::Post.where(remote_id: posts.map { |post| post[:remote_id] }).all.map(&:remote_id)
    new_posts = posts.reject { |post| known_ids.include?(post[:remote_id]) }

    # Report the number of rows actually inserted, not the page size.
    log " Inserting #{new_posts.size} posts"
    VLV::Post.multi_insert(new_posts)
  end

  # Marks the thread currently being scraped as having nothing new.
  def no_new_posts!
    @no_new_posts = true
  end

  # Logs in with the credentials from the environment and builds the fetcher
  # (which is given the session cookie) and the parser.
  #
  # @raise [RuntimeError] if `login` returns nil
  def authenticate!
    @cookie = login(ENV['VLV_USERNAME'], ENV['VLV_PASSWORD'])
    raise "Error logging into VLV. Check your credentials." if @cookie.nil?

    @fetcher = Fetcher.new(cookie: cookie)
    @parser = Parser.new
  end

  # Prints `msg` when logging was enabled in the constructor.
  def log(msg)
    puts msg if @log
  end
end