You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

scrape.rb 3.7KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137
  1. require 'dotenv/load'
  2. require 'net/http'
  3. require 'nokogiri'
  4. require 'sequel'
  5. require_relative '../lib/auth'
  # Downloads one page of the board's thread list and parses it as HTML.
  #
  # The response status is not inspected; whatever body the server returns is
  # handed to Nokogiri.
  #
  # @param page_number [Integer, String] value interpolated into the list URL
  # @param cookie [String] value sent as the `cookie` request header
  # @return the document produced by Nokogiri from the response body
  def fetch_page(page_number, cookie)
    uri = URI("http://board.vivalavinyl.com/thread/list/#{page_number}")

    get = Net::HTTP::Get.new(uri)
    get['cookie'] = cookie

    body = Net::HTTP.new(uri.host, uri.port).request(get).body
    Nokogiri::HTML(body)
  end
  # Downloads a single thread's page and parses it as HTML.
  #
  # The `&ajax=true` suffix is appended directly to the path exactly as the
  # URL is written here (there is no `?` separator).
  # NOTE(review): presumably the board routes this form — confirm.
  #
  # @param thread [Hash] must provide `:remote_id`, which is interpolated into
  #   the thread URL
  # @param cookie [String] value sent as the `cookie` request header
  # @return the document produced by Nokogiri from the response body
  def fetch_thread(thread, cookie)
    uri = URI("http://board.vivalavinyl.com/thread/view/#{thread[:remote_id]}&ajax=true")

    get = Net::HTTP::Get.new(uri)
    get['cookie'] = cookie

    body = Net::HTTP.new(uri.host, uri.port).request(get).body
    Nokogiri::HTML(body)
  end
  # Extracts every post found on a parsed thread page.
  #
  # Each `.post` node is turned into a Hash with these keys:
  # * `:remote_id`  - integer taken from the part of the `ul.view` element's
  #   `id` attribute that follows the first underscore
  # * `:creator`    - stripped text of the `.memberlink` element
  # * `:created_at` - Time parsed from the `.postinfo` text after
  #   'posted this', where date and time are separated by '@'
  # * `:body`       - inner HTML of `.postbody`, stripped
  # * `:thread_id`  - `thread[:id]`
  #
  # NOTE(review): Time.parse comes from the stdlib 'time' library, which this
  # file does not require explicitly — confirm that a dependency loads it.
  #
  # @param thread [Hash] must provide `:id`
  # @param page [#css] parsed thread page
  # @return [Array<Hash>] one hash per post, in document order
  def parse_posts(thread, page)
    page.css('.post').map do |node|
      date, time = node.at_css('.postinfo').text.split('posted this')[1].split('@')

      {
        remote_id: node.at_css('ul.view')[:id].split('_')[1].to_i,
        creator: node.at_css('.memberlink').text.strip,
        created_at: Time.parse("#{date} #{time}"),
        body: node.at_css('.postbody').children.map(&:to_html).join.strip,
        thread_id: thread[:id]
      }
    end
  end
  # Inserts one post row through a raw parameterised SQL statement.
  #
  # The remote id is read from `post[:remote_id]`, the key that parse_posts
  # fills in. `post[:id]` is still honoured as a fallback for callers that
  # supply the id under that key.
  #
  # NOTE(review): this statement writes the time to a `timestamp` column while
  # scrape inserts posts with a `created_at` column — confirm which one the
  # posts table actually has.
  #
  # @param post [Hash] provides `:body`, `:created_at`, `:creator`,
  #   `:thread_id` and `:remote_id` (or `:id`)
  # @param db [#exec] object accepting `exec(sql, params)`
  # @return whatever `db.exec` returns
  def insert_post(post, db)
    db.exec(
      'insert into posts (body, timestamp, creator, thread_id, remote_id) values ($1, $2, $3, $4, $5)',
      [
        post[:body],
        post[:created_at].to_s,
        post[:creator],
        post[:thread_id].to_i,
        (post[:remote_id] || post[:id]).to_i
      ]
    )
  end
  # Extracts the threads listed on a parsed thread-list page.
  #
  # Every `.even` / `.odd` row that contains a `.subject > a` link yields a
  # Hash with:
  # * `:remote_id` - fourth '/'-separated segment of the link's href (String)
  # * `:title`     - text of the link
  # * `:creator`   - stripped `.memberlink` text, minus a single trailing '+'
  #
  # Rows without a subject link are skipped.
  #
  # @param page [#css] parsed list page
  # @return [Array<Hash>] one hash per thread, in document order
  def parse_threads(page)
    page.css('.even, .odd').each_with_object([]) do |row, threads|
      thread_link = row.at_css('.subject > a')
      next if thread_link.nil?

      creator = row.at_css('.memberlink').text.strip
      # Only a '+' at the very end of the name is a marker to drop. The former
      # /\+$/ test also matched a '+' before an inner line break and then cut
      # off an unrelated last character.
      creator = creator[0...-1] if creator.end_with?('+')

      threads << {
        remote_id: thread_link['href'].split('/')[3],
        title: thread_link.text,
        creator: creator
      }
    end
  end
  # Scrapes thread-list pages `first` through `last` (inclusive) and stores
  # threads and posts that are not yet in the database.
  #
  # Credentials are read from ENV['VLV_USERNAME'] / ENV['VLV_PASSWORD'] and the
  # Postgres database name from ENV['DB_DATABASE']. `login` is not defined in
  # this file; presumably it comes from ../lib/auth and returns the cookie
  # string — confirm.
  #
  # The database connection is disconnected when scraping finishes or raises.
  #
  # @param first [Integer] first list page number to scrape
  # @param last [Integer] last list page number to scrape
  # @return [Range] the range of page numbers that was iterated
  def scrape(first: 0, last: 0)
    cookie = login(ENV['VLV_USERNAME'], ENV['VLV_PASSWORD'])
    db = Sequel.connect(adapter: :postgres, database: ENV['DB_DATABASE'])

    (first..last).each do |page_number|
      list_page = fetch_page(page_number, cookie)
      parse_threads(list_page).each do |listing|
        puts listing[:title]
        scrape_thread(listing, cookie, db)
      end
    end
  ensure
    # Release the connection even when a fetch, parse or insert raises.
    db.disconnect if db
  end

  # Fetches one listed thread, inserts its thread row when the remote_id is
  # unknown, and stores the posts that are missing from the posts table.
  #
  # Nothing is stored when the page has no `.postinfo:first-child` element.
  # A thread whose last parsed post is already stored is treated as up to date.
  #
  # @param listing [Hash] thread hash from parse_threads; `:created_at` and,
  #   for new threads, `:id` are written into it
  # @param cookie [String] session cookie passed to fetch_thread
  # @param db [Sequel::Database] open database handle
  # @return [void]
  def scrape_thread(listing, cookie, db)
    thread_page = fetch_thread(listing, cookie)
    first_post = thread_page.at_css('.postinfo:first-child')
    return if first_post.nil?

    date, time = first_post.text.split('posted this')[1].split('@')
    listing[:created_at] = Time.parse("#{date} #{time}")

    thread = db.from(:threads).first(remote_id: listing[:remote_id])
    new_thread = thread.nil?
    if new_thread
      puts ' Inserting thread'
      listing[:id] = db.from(:threads).insert(
        title: listing[:title],
        creator: listing[:creator],
        remote_id: listing[:remote_id],
        created_at: listing[:created_at]
      )
      thread = listing
    end

    posts = parse_posts(thread, thread_page)
    # Without this guard an empty page would fail on posts.last below.
    if posts.empty?
      puts ' No posts found, skipping'
      return
    end

    unless db.from(:posts).first(remote_id: posts.last[:remote_id]).nil?
      puts ' Up to date, skipping'
      return
    end

    insert_missing_posts(posts, db, new_thread)
  end

  # Inserts each post that is not already stored, printing a progress counter
  # that is overwritten in place.
  #
  # @param posts [Array<Hash>] post hashes from parse_posts
  # @param db [Sequel::Database] open database handle
  # @param new_thread [Boolean] when true the per-post existence lookup is
  #   skipped, because the thread row was only just inserted
  # @return [void]
  def insert_missing_posts(posts, db, new_thread)
    total = posts.size
    posts.each_with_index do |post, index|
      msg = " Inserting post #{index + 1}/#{total}"
      print msg
      if new_thread || db.from(:posts).first(remote_id: post[:remote_id]).nil?
        db.from(:posts).insert(
          body: post[:body],
          created_at: post[:created_at],
          thread_id: post[:thread_id],
          creator: post[:creator],
          remote_id: post[:remote_id]
        )
      end
      # Backspace over the counter so the next one replaces it; the final
      # counter stays visible.
      print "\b" * msg.size unless index == total - 1
    end
    puts
  end