You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

scrape.rb 3.8KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142
  1. require 'dotenv/load'
  2. require 'net/http'
  3. require 'nokogiri'
  4. require 'sequel'
  5. require_relative '../db/connect'
  6. require_relative '../lib/auth'
  # Downloads one page of the board's thread listing and parses it as HTML.
  #
  # @param page_number [Object] value interpolated into the listing URL path
  # @param cookie [String] value sent as the request's `cookie` header
  # @return the document produced by `Nokogiri.HTML` from the response body
  def fetch_page(page_number, cookie)
    uri = URI("http://board.vivalavinyl.com/thread/list/#{page_number}")
    client = Net::HTTP.new(uri.host, uri.port)

    get = Net::HTTP::Get.new(uri)
    get['cookie'] = cookie

    Nokogiri.HTML(client.request(get).body)
  end
  # Downloads the view of a single thread and parses it as HTML.
  #
  # Always pauses for half a second before issuing the request
  # (presumably to avoid hammering the board; confirm with the author).
  #
  # @param thread [Hash] must provide `:remote_id`, which is interpolated
  #   into the request URL
  # @param cookie [String] value sent as the request's `cookie` header
  # @return the document produced by `Nokogiri.HTML` from the response body
  def fetch_thread(thread, cookie)
    sleep(0.5)

    uri = URI("http://board.vivalavinyl.com/thread/view/#{thread[:remote_id]}&ajax=true")
    client = Net::HTTP.new(uri.host, uri.port)

    get = Net::HTTP::Get.new(uri)
    get['cookie'] = cookie

    Nokogiri.HTML(client.request(get).body)
  end
  # Turns every `.post` element of a parsed thread page into a post hash.
  #
  # @param thread [Hash] its `:id` is copied into each post as `:thread_id`
  # @param page [#css] parsed thread document
  # @return [Array<Hash>] one hash per `.post` element with the keys
  #   `:remote_id`, `:creator`, `:created_at`, `:body` and `:thread_id`
  def parse_posts(thread, page)
    page.css('.post').map do |node|
      # The element id has the form "<prefix>_<number>"; keep the number.
      remote_id = node.at_css('ul.view')[:id].split('_')[1].to_i
      author = node.at_css('.memberlink').text.strip

      # The post info text reads "... posted this <date> @ <time>".
      stamp = node.at_css('.postinfo').text.split('posted this')[1]
      date, time = stamp.split('@')
      posted_at = Time.parse("#{date} #{time}")

      # Inner HTML of the post body, without surrounding whitespace.
      inner_html = node.at_css('.postbody').children.map(&:to_html).join.strip

      {
        remote_id: remote_id,
        creator: author,
        created_at: posted_at,
        body: inner_html,
        thread_id: thread[:id]
      }
    end
  end
  # Inserts one post through a raw parameterised SQL statement.
  #
  # The remote id is read from `:remote_id`, which is the key `parse_posts`
  # stores it under. `:id` is still honoured as a fallback so callers that
  # used the old key keep working. Previously only `:id` was read, so posts
  # built by `parse_posts` were written with a remote id of 0.
  #
  # NOTE(review): this statement writes to a `timestamp` column, whereas
  # `scrape` in this file inserts posts with a `created_at` column; confirm
  # which one matches the schema.
  #
  # @param post [Hash] provides `:body`, `:created_at`, `:creator`,
  #   `:thread_id` and `:remote_id` (or the legacy `:id`)
  # @param db [#exec] object responding to `exec(sql, params)`
  # @return whatever `db.exec` returns
  def insert_post(post, db)
    remote_id = post[:remote_id] || post[:id]

    db.exec(
      'insert into posts (body, timestamp, creator, thread_id, remote_id) values ($1, $2, $3, $4, $5)',
      [
        post[:body],
        post[:created_at].to_s,
        post[:creator],
        post[:thread_id].to_i,
        remote_id.to_i
      ]
    )
  end
  # Extracts the threads listed on a parsed listing page.
  #
  # Rows are the `.even` / `.odd` elements; rows without a `.subject > a`
  # link are skipped.
  #
  # @param page [#css] parsed listing document
  # @return [Array<Hash>] one hash per thread with `:remote_id` (fourth
  #   slash-separated segment of the link's href, as a String), `:title`
  #   (link text) and `:creator`
  def parse_threads(page)
    page.css('.even, .odd').each_with_object([]) do |row, threads|
      link = row.at_css('.subject > a')
      next if link.nil?

      # Drop a single trailing "+" from the member name. `chomp` only looks
      # at the very end of the string, unlike the former `/\+$/` test, which
      # also fired on a "+" before an inner line break and then chopped off
      # an unrelated last character.
      creator = row.at_css('.memberlink').text.strip.chomp('+')

      threads << {
        remote_id: link['href'].split('/')[3],
        title: link.text,
        creator: creator
      }
    end
  end
  # Walks listing pages `first..last`, storing threads and posts that are
  # not yet in the database.
  #
  # For every thread on a listing page the thread view is fetched, the
  # thread row is inserted when no row with its `remote_id` exists, and each
  # parsed post is inserted unless a post with the same `remote_id` exists.
  # When the last post of a thread is already stored, the rest of that
  # listing page is abandoned (sticky threads are merely skipped).
  #
  # Credentials are read from the `VLV_USERNAME` and `VLV_PASSWORD`
  # environment variables.
  #
  # @param first [Integer] first listing page number (inclusive)
  # @param last [Integer] last listing page number (inclusive)
  # @param log [Boolean] print progress to stdout when true
  def scrape(first: 0, last: 0, log: false)
    cookie = login(ENV['VLV_USERNAME'], ENV['VLV_PASSWORD'])
    db = connect

    (first..last).each_with_index do |page_number, page_index|
      listing = fetch_page(page_number, cookie)

      parse_threads(listing).each do |t|
        is_sticky = t[:title].match(/^Sticky:/)
        # Sticky threads are only processed on the first page of the run
        # (presumably because they are repeated on every listing page).
        next if page_index > 0 && is_sticky

        puts t[:title] if log

        thread_page = fetch_thread(t, cookie)
        first_post = thread_page.at_css('.postinfo:first-child')
        next if first_post.nil?

        # The post info text reads "... posted this <date> @ <time>".
        date, time = first_post.text.split('posted this')[1].split('@')
        t[:created_at] = Time.parse("#{date} #{time}")

        thread = db.from(:threads).first(remote_id: t[:remote_id])
        is_new_thread = thread.nil?
        if is_new_thread
          puts ' Inserting thread' if log
          t[:id] =
            db.from(:threads).insert(
              title: t[:title],
              creator: t[:creator],
              remote_id: t[:remote_id],
              created_at: t[:created_at]
            )
          thread = t
        end

        posts = parse_posts(thread, thread_page)
        # A page without any parsed post has nothing to store; without this
        # guard `posts.last` is nil and the lookup below raises.
        next if posts.empty?

        unless db.from(:posts).first(remote_id: posts.last[:remote_id]).nil?
          # Newest post already stored: skip a sticky, otherwise stop
          # processing the remaining threads of this listing page.
          next if is_sticky

          break
        end

        posts_count = posts.size
        posts.each_with_index do |post, post_index|
          msg = " Inserting post #{post_index + 1}/#{posts_count}"
          print msg if log

          if is_new_thread || db.from(:posts).first(remote_id: post[:remote_id]).nil?
            db.from(:posts).insert(
              body: post[:body],
              created_at: post[:created_at],
              thread_id: post[:thread_id],
              creator: post[:creator],
              remote_id: post[:remote_id]
            )
          end

          # Rewind the cursor so the next progress message overwrites this
          # one; the final message is left in place.
          print "\b" * msg.size if log && post_index != posts_count - 1
        end
        puts if log
      end
    end
  end