You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

scrape.rb 3.7KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139
  1. require 'dotenv/load'
  2. require 'net/http'
  3. require 'nokogiri'
  4. require 'sequel'
  5. require_relative '../db/connect'
  6. require_relative '../lib/auth'
  # Downloads one page of the board's thread listing and parses it as HTML.
  #
  # The response status is not inspected; whatever body comes back is parsed.
  #
  # @param page_number [#to_s] page number interpolated into the list URL
  # @param cookie [String] value sent as the request's `cookie` header
  # @return the document produced by `Nokogiri.HTML` for the response body
  def fetch_page(page_number, cookie)
    uri = URI("http://board.vivalavinyl.com/thread/list/#{page_number}")
    get = Net::HTTP::Get.new(uri)
    get['cookie'] = cookie
    reply = Net::HTTP.new(uri.host, uri.port).request(get)
    Nokogiri.HTML(reply.body)
  end
  # Downloads a single thread's view page (requested with `ajax=true`) and
  # parses it as HTML.
  #
  # The response status is not inspected; whatever body comes back is parsed.
  #
  # @param thread [Hash] must provide `:remote_id`, which is placed in the URL
  # @param cookie [String] value sent as the request's `cookie` header
  # @return the document produced by `Nokogiri.HTML` for the response body
  def fetch_thread(thread, cookie)
    address = "http://board.vivalavinyl.com/thread/view/#{thread[:remote_id]}&ajax=true"
    uri = URI(address)
    get = Net::HTTP::Get.new(uri).tap { |req| req['cookie'] = cookie }
    connection = Net::HTTP.new(uri.host, uri.port)
    Nokogiri.HTML(connection.request(get).body)
  end
  # Turns every `.post` element of a thread page into a post hash.
  #
  # Each hash has the keys :remote_id, :creator, :created_at, :body and
  # :thread_id, in that order.
  #
  # @param thread [Hash] supplies `:id`, copied into each post as `:thread_id`
  # @param page [#css] parsed thread page
  # @return [Array<Hash>] one hash per `.post` element, in document order
  def parse_posts(thread, page)
    page.css('.post').map do |node|
      # The element id looks like "<prefix>_<number>"; keep the number.
      remote_id = node.at_css('ul.view')[:id].split('_')[1].to_i
      author = node.at_css('.memberlink').text.strip
      # The info text reads "... posted this <date> @ <time>".
      stamp = node.at_css('.postinfo').text.split('posted this')[1]
      day, clock = stamp.split('@')
      markup = node.at_css('.postbody').children.map(&:to_html).join.strip

      {
        remote_id: remote_id,
        creator: author,
        created_at: Time.parse("#{day} #{clock}"),
        body: markup,
        thread_id: thread[:id]
      }
    end
  end
  # Inserts one post row through a parameterized INSERT statement.
  #
  # NOTE(review): this writes to a `timestamp` column via `db.exec`, while
  # `scrape` inserts posts into `created_at` through `db.from(:posts)`; confirm
  # which schema and database handle this helper is meant for.
  #
  # @param post [Hash] with :body, :created_at, :creator, :thread_id and
  #   :remote_id (a legacy :id key is used when :remote_id is absent)
  # @param db [#exec] object whose `exec(sql, params)` runs the statement
  # @return whatever `db.exec` returns
  def insert_post(post, db)
    # parse_posts stores the board's post id under :remote_id. Reading only
    # :id turned the missing key into nil.to_i == 0 for every row.
    remote_id = post[:remote_id] || post[:id]

    db.exec(
      'insert into posts (body, timestamp, creator, thread_id, remote_id) values ($1, $2, $3, $4, $5)',
      [
        post[:body],
        post[:created_at].to_s,
        post[:creator],
        post[:thread_id].to_i,
        remote_id.to_i
      ]
    )
  end
  # Extracts thread summaries from a thread-list page.
  #
  # Rows are the `.even` / `.odd` elements; rows without a `.subject > a` link
  # are skipped.
  #
  # @param page [#css] parsed thread-list page
  # @return [Array<Hash>] hashes with :remote_id (String, the fourth
  #   "/"-separated segment of the link's href), :title and :creator
  def parse_threads(page)
    page.css('.even, .odd').each_with_object([]) do |row, threads|
      link = row.at_css('.subject > a')
      next if link.nil?

      # A single trailing "+" on the member text is dropped. chomp only looks
      # at the very end of the string, whereas the earlier line-anchored
      # /\+$/ also fired on a "+" before an embedded newline and then cut off
      # the wrong character.
      creator = row.at_css('.memberlink').text.strip.chomp('+')

      threads << {
        remote_id: link['href'].split('/')[3],
        title: link.text,
        creator: creator
      }
    end
  end
  # Scrapes thread-list pages `first..last` and stores new threads and posts.
  #
  # For every listing page the threads are parsed and each thread's page is
  # fetched. A thread row is inserted when its `remote_id` is not yet in
  # `threads`, and every post whose `remote_id` is not yet in `posts` is
  # inserted. Progress is printed to stdout.
  #
  # Relies on helpers defined outside this method: `login`, `connect`,
  # `fetch_page`, `fetch_thread`, `parse_threads` and `parse_posts`.
  #
  # @param first [Integer] first listing page number (inclusive)
  # @param last [Integer] last listing page number (inclusive)
  # @return [Range] the iterated page range
  def scrape(first: 0, last: 0)
    cookie = login(ENV['VLV_USERNAME'], ENV['VLV_PASSWORD'])
    db = connect

    (first..last).each_with_index do |page_number, page_index|
      listing = fetch_page(page_number, cookie)

      parse_threads(listing).each do |t|
        is_sticky = t[:title].match(/^Sticky:/)
        # Stickies are only processed on the first page of a run.
        next if page_index > 0 && is_sticky

        # Pause before each thread request; skipped stickies make no request,
        # so they no longer cost a second each.
        sleep(1)
        thread_page = fetch_thread(t, cookie)
        first_post = thread_page.at_css('.postinfo:first-child')
        next if first_post.nil?

        # The info text reads "... posted this <date> @ <time>".
        date, time = first_post.text.split('posted this')[1].split('@')
        t[:created_at] = Time.parse("#{date} #{time}")

        thread = db.from(:threads).first(remote_id: t[:remote_id])
        is_new_thread = thread.nil?
        if is_new_thread
          puts ' Inserting thread'
          t[:id] =
            db.from(:threads).insert(
              title: t[:title],
              creator: t[:creator],
              remote_id: t[:remote_id],
              created_at: t[:created_at]
            )
          thread = t
        end

        posts = parse_posts(thread, thread_page)
        # Nothing parsed: skip the thread instead of indexing into a nil
        # "last post" below.
        next if posts.empty?

        # The newest post on the page is already stored, so this thread has
        # nothing new.
        if db.from(:posts).first(remote_id: posts.last[:remote_id])
          next if is_sticky

          # NOTE(review): this leaves only the current page's thread loop; the
          # outer page loop still moves on to the next listing page. Confirm
          # whether the whole run is meant to stop here.
          break
        end

        posts_count = posts.size
        posts.each_with_index do |post, post_index|
          msg = " Inserting post #{post_index + 1}/#{posts_count}"
          print msg
          if is_new_thread || db.from(:posts).first(remote_id: post[:remote_id]).nil?
            db.from(:posts).insert(
              body: post[:body],
              created_at: post[:created_at],
              thread_id: post[:thread_id],
              creator: post[:creator],
              remote_id: post[:remote_id]
            )
          end
          # Backspace over the progress text so the next one overwrites it;
          # the final message stays visible.
          print "\b" * msg.size unless post_index == posts_count - 1
        end
        puts
      end
    end
  end