mirror of
https://github.com/thewesker/twitter_ebooks.git
synced 2025-12-23 13:51:09 -05:00
Allow consumption of JSON archives
This commit is contained in:
@@ -17,14 +17,22 @@ module Ebooks
|
||||
Marshal.load(File.read(path))
|
||||
end
|
||||
|
||||
def consume(txtpath)
|
||||
# Record hash of source file so we know to update later
|
||||
@hash = Digest::MD5.hexdigest(File.read(txtpath))
|
||||
def consume(path)
|
||||
content = File.read(path)
|
||||
@hash = Digest::MD5.hexdigest(content)
|
||||
|
||||
if path.split('.')[-1] == "json"
|
||||
log "Reading json corpus from #{path}"
|
||||
lines = JSON.parse(content, symbolize_names: true).map do |tweet|
|
||||
tweet[:text]
|
||||
end
|
||||
else
|
||||
log "Reading plaintext corpus from #{path}"
|
||||
lines = content.split("\n")
|
||||
end
|
||||
|
||||
text = File.read(txtpath)
|
||||
log "Removing commented lines and sorting mentions"
|
||||
|
||||
lines = text.split("\n")
|
||||
keeping = []
|
||||
mentions = []
|
||||
lines.each do |l|
|
||||
|
||||
Reference in New Issue
Block a user