123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142 |
require 'dotenv/load'
require 'net/http'
require 'nokogiri'
require 'sequel'
require 'time'

require_relative '../db/connect'
require_relative '../lib/auth'
-
# Downloads one page of the board's thread listing and parses it.
#
# @param page_number [Integer] listing page number, interpolated into the URL path
# @param cookie [String] value sent as the request's `cookie` header
# @return the document produced by Nokogiri.HTML for the response body
def fetch_page(page_number, cookie)
  uri = URI("http://board.vivalavinyl.com/thread/list/#{page_number}")
  get = Net::HTTP::Get.new(uri).tap { |req| req['cookie'] = cookie }
  reply = Net::HTTP.new(uri.host, uri.port).request(get)
  Nokogiri.HTML(reply.body)
end
-
# Downloads the page for a single thread and parses it.
#
# Pauses for half a second before every request (presumably to throttle the
# scraper -- confirm with the board's operators if the delay is changed).
#
# @param thread [Hash] must provide :remote_id, which is interpolated into the URL
# @param cookie [String] value sent as the request's `cookie` header
# @return the document produced by Nokogiri.HTML for the response body
def fetch_thread(thread, cookie)
  sleep(0.5)
  uri = URI("http://board.vivalavinyl.com/thread/view/#{thread[:remote_id]}&ajax=true")
  get = Net::HTTP::Get.new(uri).tap { |req| req['cookie'] = cookie }
  reply = Net::HTTP.new(uri.host, uri.port).request(get)
  Nokogiri.HTML(reply.body)
end
-
# Extracts every post found on a thread page.
#
# Each `.post` node becomes a hash with these keys:
#   :remote_id  - integer taken from the part of the `ul.view` id after the first "_"
#   :creator    - stripped text of the `.memberlink` node
#   :created_at - Time parsed from the `.postinfo` text following "posted this",
#                 with the "@" between date and time replaced by a space
#   :body       - inner HTML of `.postbody`, stripped
#   :thread_id  - thread[:id]
#
# @param thread [Hash] supplies :id for the posts' :thread_id
# @param page a parsed document responding to #css
# @return [Array<Hash>] one hash per post, in document order
def parse_posts(thread, page)
  page.css('.post').map do |node|
    remote_id = node.at_css('ul.view')[:id].split('_')[1].to_i
    author = node.at_css('.memberlink').text.strip
    info_text = node.at_css('.postinfo').text
    date, time = info_text.split('posted this')[1].split('@')
    posted_at = Time.parse("#{date} #{time}")
    html = node.at_css('.postbody').children.map(&:to_html).join.strip

    {
      remote_id: remote_id,
      creator: author,
      created_at: posted_at,
      body: html,
      thread_id: thread[:id]
    }
  end
end
-
# Inserts one post row through a raw parameterised SQL statement.
#
# The remote id is read from post[:remote_id], which is the key parse_posts
# produces. The previous code read post[:id], which those hashes never carry,
# so nil.to_i stored 0 for every row. post[:id] is still honoured as a
# fallback for callers that pass the older shape.
#
# NOTE(review): this statement writes to a `timestamp` column, while scrape
# inserts posts with a `created_at` column through the dataset API -- confirm
# which one matches the schema.
#
# @param post [Hash] with :body, :created_at, :creator, :thread_id and
#   :remote_id (or legacy :id)
# @param db an object responding to #exec(sql, params)
# @return whatever db.exec returns
def insert_post(post, db)
  remote_id = post[:remote_id] || post[:id]

  db.exec(
    'insert into posts (body, timestamp, creator, thread_id, remote_id) values ($1, $2, $3, $4, $5)',
    [
      post[:body],
      post[:created_at].to_s,
      post[:creator],
      post[:thread_id].to_i,
      remote_id.to_i
    ]
  )
end
-
# Extracts the threads listed on a thread-list page.
#
# Rows are the `.even` / `.odd` nodes; a row without a `.subject > a` link is
# skipped. Each remaining row becomes a hash with these keys:
#   :remote_id - fourth "/"-separated segment of the link's href (a String)
#   :title     - the link text
#   :creator   - stripped `.memberlink` text, with its last character dropped
#                when the text matches /\+$/
#
# @param page a parsed document responding to #css
# @return [Array<Hash>] one hash per thread row, in document order
def parse_threads(page)
  page.css('.even, .odd').each_with_object([]) do |row, found|
    link = row.at_css('.subject > a')
    next if link.nil?

    remote_id = link['href'].split('/')[3]
    title = link.text
    author = row.at_css('.memberlink').text.strip
    author = author[0..-2] if author =~ /\+$/

    found << { remote_id: remote_id, title: title, creator: author }
  end
end
-
# Walks thread-list pages +first+ through +last+ and stores every thread and
# post that is not already in the database.
#
# For each listed thread the thread page is fetched, the thread row is created
# if no row with the same remote_id exists, and then each parsed post whose
# remote_id is not yet stored is inserted.
#
# `login` and `connect` are not defined in this file (presumably they come from
# ../lib/auth and ../db/connect); `db` is used through the Sequel dataset API
# (`from`, `first`, `insert`).
#
# @param first [Integer] first listing page number to fetch
# @param last [Integer] last listing page number to fetch (inclusive)
# @param log [Boolean] when true, prints progress to stdout
def scrape(first: 0, last: 0, log: false)
  cookie = login(ENV['VLV_USERNAME'], ENV['VLV_PASSWORD'])
  db = connect

  (first..last).each_with_index do |page_number, page_index|
    listing = fetch_page(page_number, cookie)

    parse_threads(listing).each do |t|
      sticky = t[:title].match(/^Sticky:/)
      # Sticky threads are only processed on the first page of a run.
      next if page_index > 0 && sticky

      puts t[:title] if log

      thread_page = fetch_thread(t, cookie)
      first_post = thread_page.at_css('.postinfo:first-child')
      next if first_post.nil?

      # The thread's creation time is taken from its first post's info line.
      date, time = first_post.text.split('posted this')[1].split('@')
      t[:created_at] = Time.parse("#{date} #{time}")

      thread = db.from(:threads).first(remote_id: t[:remote_id])
      new_thread = thread.nil?
      if new_thread
        puts ' Inserting thread' if log
        t[:id] =
          db.from(:threads).insert(
            title: t[:title],
            creator: t[:creator],
            remote_id: t[:remote_id],
            created_at: t[:created_at]
          )
        thread = t
      end

      posts = parse_posts(thread, thread_page)
      # A page with no parsable posts has nothing to store; without this guard
      # the lookup below would call [] on nil.
      next if posts.empty?

      # If the newest post is already stored, the thread is up to date.
      unless db.from(:posts).first(remote_id: posts.last[:remote_id]).nil?
        next if sticky

        # Skips the rest of THIS listing page only; the outer loop still moves
        # on to the next page. NOTE(review): presumably relies on the listing
        # being ordered by most recent activity -- confirm.
        break
      end

      total = posts.size
      posts.each_with_index do |post, post_index|
        progress = " Inserting post #{post_index + 1}/#{total}"
        print progress if log

        # Every post of a brand-new thread is new, so the lookup is skipped.
        if new_thread || db.from(:posts).first(remote_id: post[:remote_id]).nil?
          db.from(:posts).insert(
            body: post[:body],
            created_at: post[:created_at],
            thread_id: post[:thread_id],
            creator: post[:creator],
            remote_id: post[:remote_id]
          )
        end

        # Backspace over the progress text so the next one overwrites it;
        # the final message is left in place.
        print "\b" * progress.size if log && post_index != total - 1
      end

      puts if log
    end
  end
end
|