You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

scrape.rb 3.6KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137
  1. require 'dotenv/load'
  2. require 'net/http'
  3. require 'nokogiri'
  4. require 'sequel'
  5. require_relative '../db/connect'
  6. require_relative '../lib/auth'
  # Downloads one page of the board's thread listing and parses it as HTML.
  #
  # @param page_number [Integer, String] interpolated into the listing URL path
  # @param cookie [String] value sent verbatim in the `cookie` request header
  # @return the document produced by Nokogiri.HTML for the response body
  def fetch_page(page_number, cookie)
    uri = URI("http://board.vivalavinyl.com/thread/list/#{page_number}")

    get = Net::HTTP::Get.new(uri)
    get['cookie'] = cookie

    # The response status is not inspected; whatever body comes back is parsed.
    body = Net::HTTP.new(uri.host, uri.port).request(get).body
    Nokogiri.HTML(body)
  end
  # Downloads the view of a single thread and parses it as HTML.
  #
  # @param thread [Hash] must provide :remote_id, which is interpolated into the URL
  # @param cookie [String] value sent verbatim in the `cookie` request header
  # @return the document produced by Nokogiri.HTML for the response body
  def fetch_thread(thread, cookie)
    # The "&ajax=true" suffix is appended directly to the path, exactly as the
    # request has always been issued.
    uri = URI("http://board.vivalavinyl.com/thread/view/#{thread[:remote_id]}&ajax=true")

    get = Net::HTTP::Get.new(uri)
    get['cookie'] = cookie

    reply = Net::HTTP.new(uri.host, uri.port).request(get)
    Nokogiri.HTML(reply.body)
  end
  # Extracts every `.post` element of a thread page into a plain Hash.
  #
  # Each returned Hash has the keys :remote_id (Integer taken from the part of
  # the `ul.view` element's id after the first underscore), :creator, :created_at
  # (Time), :body (inner HTML of `.postbody`, stripped) and :thread_id.
  #
  # NOTE(review): Time.parse comes from the `time` standard library; no explicit
  # `require 'time'` is visible in this file, so it is presumably loaded by one
  # of the other requires - confirm.
  #
  # @param thread [Hash] must provide :id, copied into each post as :thread_id
  # @param page [#css] parsed thread document
  # @return [Array<Hash>] one Hash per post, in document order
  def parse_posts(thread, page)
    page.css('.post').map do |node|
      # The post info text reads "... posted this <date> @ <time>".
      date, time = node.at_css('.postinfo').text.split('posted this')[1].split('@')

      {
        remote_id: node.at_css('ul.view')[:id].split('_')[1].to_i,
        creator: node.at_css('.memberlink').text.strip,
        created_at: Time.parse("#{date} #{time}"),
        body: node.at_css('.postbody').children.map(&:to_html).join.strip,
        thread_id: thread[:id]
      }
    end
  end
  # Inserts one parsed post into the `posts` table using a parameterized INSERT.
  #
  # The board's post id is read from :remote_id, which is the key parse_posts
  # fills in. Reading :id instead (as this method used to) yields nil for those
  # hashes, and nil.to_i stored 0 as the remote_id of every row. :id is still
  # honoured as a fallback for callers that pass the id under that key.
  #
  # NOTE(review): this statement writes the time to a `timestamp` column while
  # `scrape` inserts into `created_at` - confirm which one the schema defines.
  #
  # @param post [Hash] with :body, :created_at, :creator, :thread_id and
  #   :remote_id (or legacy :id)
  # @param db [#exec] object accepting `exec(sql, params)`
  # @return whatever `db.exec` returns
  def insert_post(post, db)
    remote_id = post[:remote_id] || post[:id]

    db.exec(
      'insert into posts (body, timestamp, creator, thread_id, remote_id) values ($1, $2, $3, $4, $5)',
      [
        post[:body],
        post[:created_at].to_s,
        post[:creator],
        post[:thread_id].to_i,
        remote_id.to_i
      ]
    )
  end
  # Extracts the threads listed on a thread-list page.
  #
  # Rows (`.even` / `.odd`) without a `.subject > a` link are skipped. For the
  # others the result holds :remote_id (fourth "/"-separated segment of the
  # link's href, a String), :title (link text) and :creator.
  #
  # A single trailing "+" is removed from the creator name. This uses
  # String#chomp rather than the regex /\+$/: `$` is a line anchor, so with
  # multi-line text it could match a "+" in the middle of the string and the
  # follow-up slice would then drop an unrelated last character.
  #
  # @param page [#css] parsed listing document
  # @return [Array<Hash>] one Hash per thread row, in document order
  def parse_threads(page)
    page.css('.even, .odd').each_with_object([]) do |row, threads|
      link = row.at_css('.subject > a')
      next if link.nil?

      threads << {
        remote_id: link['href'].split('/')[3],
        title: link.text,
        creator: row.at_css('.memberlink').text.strip.chomp('+')
      }
    end
  end
  # Walks the thread-list pages `first`..`last` (inclusive) and stores threads
  # and posts that are not yet in the database.
  #
  # For every listed thread the thread page is fetched (after a one second
  # pause), the thread row is inserted when no row with the same :remote_id
  # exists, and its posts are inserted through insert_missing_posts.
  #
  # Thread pages that yield no posts are skipped; previously `posts.last` was
  # nil in that case and the lookup on it raised NoMethodError.
  #
  # NOTE(review): Time.parse comes from the `time` standard library; no explicit
  # `require 'time'` is visible in this file, so it is presumably loaded by one
  # of the other requires - confirm.
  #
  # @param first [Integer] first listing page number
  # @param last [Integer] last listing page number
  # @return [Range] the page range that was iterated
  def scrape(first: 0, last: 0)
    cookie = login(ENV['VLV_USERNAME'], ENV['VLV_PASSWORD'])
    db = connect

    (first..last).each do |page_number|
      listing = fetch_page(page_number, cookie)

      parse_threads(listing).each do |t|
        puts t[:title]
        sleep(1) # pause before each thread request
        thread_page = fetch_thread(t, cookie)

        first_post = thread_page.at_css('.postinfo:first-child')
        next if first_post.nil?

        # The post info text reads "... posted this <date> @ <time>".
        date, time = first_post.text.split('posted this')[1].split('@')
        t[:created_at] = Time.parse("#{date} #{time}")

        thread = db.from(:threads).first(remote_id: t[:remote_id])
        is_new_thread = thread.nil?
        if is_new_thread
          puts ' Inserting thread'
          t[:id] =
            db.from(:threads).insert(
              title: t[:title],
              creator: t[:creator],
              remote_id: t[:remote_id],
              created_at: t[:created_at]
            )
          thread = t
        end

        posts = parse_posts(thread, thread_page)
        next if posts.empty?

        # The newest post is already stored: stop processing the rest of this
        # listing page.
        # NOTE(review): presumably relies on the listing being ordered by most
        # recent activity - confirm that `break` (not `next`) is intended.
        break unless db.from(:posts).first(remote_id: posts.last[:remote_id]).nil?

        insert_missing_posts(db, posts, is_new_thread)
      end
    end
  end

  # Inserts the given posts, skipping those whose :remote_id is already stored
  # (the lookup is bypassed for a brand-new thread), while rewriting a progress
  # counter on the current terminal line.
  def insert_missing_posts(db, posts, is_new_thread)
    posts_count = posts.size

    posts.each_with_index do |post, index|
      msg = " Inserting post #{index + 1}/#{posts_count}"
      print msg

      if is_new_thread || db.from(:posts).first(remote_id: post[:remote_id]).nil?
        db.from(:posts).insert(
          body: post[:body],
          created_at: post[:created_at],
          thread_id: post[:thread_id],
          creator: post[:creator],
          remote_id: post[:remote_id]
        )
      end

      # Backspace over the counter so the next one overwrites it; the final
      # counter stays visible.
      print "\b" * msg.size unless index == posts_count - 1
    end
    puts
  end