require 'dotenv/load'
require 'net/http'
require 'time' # Time.parse lives in the stdlib 'time' extension, not in core
require 'nokogiri'
require 'sequel'
require_relative '../db/connect'
require_relative '../lib/auth'

# Issues a GET request with the given cookie header and parses the response
# body as HTML.
#
# @param url [String] absolute URL to request
# @param cookie [String] value sent verbatim as the `cookie` request header
# @return [Nokogiri::HTML::Document] the parsed response body
def fetch_html(url, cookie)
  uri = URI(url)
  http = Net::HTTP.new(uri.host, uri.port)
  request = Net::HTTP::Get.new(uri)
  request['cookie'] = cookie
  Nokogiri.HTML(http.request(request).body)
end

# Fetches one page of the board's thread listing.
#
# @param page_number [Integer] page number interpolated into the listing URL
# @param cookie [String] value for the `cookie` request header
# @return [Nokogiri::HTML::Document]
def fetch_page(page_number, cookie)
  fetch_html("http://board.vivalavinyl.com/thread/list/#{page_number}", cookie)
end

# Fetches the view page of a single thread.
#
# @param thread [Hash] must respond to `[:remote_id]`
# @param cookie [String] value for the `cookie` request header
# @return [Nokogiri::HTML::Document]
def fetch_thread(thread, cookie)
  fetch_html(
    "http://board.vivalavinyl.com/thread/view/#{thread[:remote_id]}&ajax=true",
    cookie
  )
end

# Extracts the timestamp from the text of a `.postinfo` element.
#
# The text is split on the literal 'posted this'; what follows is split on '@'
# into a date part and a time part, which are handed to Time.parse.
#
# @param postinfo_text [String] text content of a `.postinfo` node
# @return [Time]
# @raise [NoMethodError] if the text does not contain 'posted this'
# @raise [ArgumentError] if Time.parse cannot interpret the date/time parts
def parse_posted_at(postinfo_text)
  date, time = postinfo_text.split('posted this')[1].split('@')
  Time.parse("#{date} #{time}")
end

# Builds one hash per `.post` element found on a thread page.
#
# @param thread [Hash] its `:id` is copied into every post as `:thread_id`
# @param page [Nokogiri::HTML::Document] a fetched thread page
# @return [Array<Hash>] hashes with keys :remote_id, :creator, :created_at,
#   :body and :thread_id, in document order
def parse_posts(thread, page)
  page.css('.post').map do |node|
    {
      # The numeric id is the part after the first '_' in the id attribute
      # of the post's `ul.view` element.
      remote_id: node.at_css('ul.view')[:id].split('_')[1].to_i,
      creator: node.at_css('.memberlink').text.strip,
      created_at: parse_posted_at(node.at_css('.postinfo').text),
      body: node.at_css('.postbody').children.map(&:to_html).join.strip,
      thread_id: thread[:id]
    }
  end
end

# Inserts a single post through a raw parameterised SQL statement.
#
# NOTE(review): this calls `db.exec(sql, params)` and writes to a `timestamp`
# column, whereas `scrape` below uses Sequel datasets and a `created_at`
# column. Nothing in this file calls this method; presumably it targets a
# different connection type -- confirm before relying on it.
#
# @param post [Hash] a post hash as produced by `parse_posts`
# @param db [#exec] connection accepting an SQL string and a parameter array
# @return whatever `db.exec` returns
def insert_post(post, db)
  # parse_posts stores the board's post id under :remote_id and never sets
  # :id, so reading :id alone always wrote 0. :id is kept as a fallback for
  # any caller that still supplies it.
  remote_id = post[:remote_id] || post[:id]

  db.exec(
    'insert into posts (body, timestamp, creator, thread_id, remote_id) values ($1, $2, $3, $4, $5)',
    [
      post[:body],
      post[:created_at].to_s,
      post[:creator],
      post[:thread_id].to_i,
      remote_id.to_i
    ]
  )
end

# Builds one hash per thread row found on a listing page.
#
# Rows (elements with class `even` or `odd`) that have no `.subject > a` link
# are skipped.
#
# @param page [Nokogiri::HTML::Document] a fetched listing page
# @return [Array<Hash>] hashes with keys :remote_id (String), :title and
#   :creator
def parse_threads(page)
  page.css('.even, .odd').each_with_object([]) do |row, threads|
    thread_link = row.at_css('.subject > a')
    next if thread_link.nil?

    threads << {
      # Fourth '/'-separated segment of the link's href.
      remote_id: thread_link['href'].split('/')[3],
      title: thread_link.text,
      # A single trailing '+' is dropped from the displayed member name.
      creator: row.at_css('.memberlink').text.strip.chomp('+')
    }
  end
end

# Inserts the given posts into the `posts` table, printing a progress counter
# that overwrites itself in place via backspaces.
#
# @param db [Sequel::Database]
# @param posts [Array<Hash>] post hashes as produced by `parse_posts`
# @param is_new_thread [Boolean] when true the per-post existence lookup is
#   skipped and every post is inserted
# @return [void]
def insert_new_posts(db, posts, is_new_thread)
  posts_count = posts.size

  posts.each_with_index do |post, index|
    msg = " Inserting post #{index + 1}/#{posts_count}"
    print msg

    if is_new_thread || db.from(:posts).first(remote_id: post[:remote_id]).nil?
      db.from(:posts).insert(
        body: post[:body],
        created_at: post[:created_at],
        thread_id: post[:thread_id],
        creator: post[:creator],
        remote_id: post[:remote_id]
      )
    end

    # Erase the counter so the next one overwrites it; the final counter is
    # left on screen.
    print "\b" * msg.size unless index == posts_count - 1
  end
  puts
end

# Scrapes listing pages `first..last`, storing unseen threads and posts.
#
# For every thread on each listing page the thread page is fetched (after a
# one-second pause), the thread row is inserted if its remote id is not yet in
# `threads`, and posts whose remote id is not yet in `posts` are inserted.
#
# @param first [Integer] first listing page number (inclusive)
# @param last [Integer] last listing page number (inclusive)
# @return [Range] the iterated page range (return value of `each`)
def scrape(first: 0, last: 0)
  cookie = login(ENV['VLV_USERNAME'], ENV['VLV_PASSWORD'])
  db = connect

  (first..last).each do |page_number|
    threads = parse_threads(fetch_page(page_number, cookie))

    threads.each do |t|
      puts t[:title]
      sleep(1)

      thread_page = fetch_thread(t, cookie)
      first_post = thread_page.at_css('.postinfo:first-child')
      next if first_post.nil?

      t[:created_at] = parse_posted_at(first_post.text)

      thread = db.from(:threads).first(remote_id: t[:remote_id])
      is_new_thread = thread.nil?
      if is_new_thread
        puts ' Inserting thread'
        t[:id] = db.from(:threads).insert(
          title: t[:title],
          creator: t[:creator],
          remote_id: t[:remote_id],
          created_at: t[:created_at]
        )
        thread = t
      end

      posts = parse_posts(thread, thread_page)
      # A page without any `.post` nodes has nothing to store; without this
      # guard `posts.last` would be nil and the lookup below would raise.
      next if posts.empty?

      # NOTE(review): once a thread's newest post is already stored, the rest
      # of this listing page is abandoned (the outer page loop continues).
      # Presumably this relies on the listing being ordered by latest
      # activity -- confirm.
      break if db.from(:posts).first(remote_id: posts.last[:remote_id])

      insert_new_posts(db, posts, is_new_thread)
    end
  end
end