CloudArchive and CloudModel complete (enough)

- CloudModel is actually classed correctly
 - Minify JSON prior to uploading
 - Change archive update interval to 24h (30 uploads × ~20MB ≈ 600MB of the 1000MB limit)
 - Store latest revision url for archive/model after persisting

TODO:
 - Add INITIAL_CORPUS_FILE config var
 - Look for Cloudinary API call to get file size without downloading
 - Logic for jsonify(INITIAL_CORPUS_FILE) vs. using persisted archive
 - Add fetch/persist/etc methods to Ebooks::Archive/Model for fallback
 - Test it? Or just share it and pray.
This commit is contained in:
Joel McCoy
2015-02-11 23:41:26 -05:00
parent 4151957c21
commit 8a26f1108f
4 changed files with 90 additions and 52 deletions

View File

@@ -60,6 +60,6 @@
"TIMEOUT_SLEEP": "5", "TIMEOUT_SLEEP": "5",
"MAX_ERROR_RETRIES": "10", "MAX_ERROR_RETRIES": "10",
"UPDATE_FOLLOWS_INTERVAL": "90m", "UPDATE_FOLLOWS_INTERVAL": "90m",
"UPDATE_ARCHIVE_INTERVAL": "8h" "UPDATE_ARCHIVE_INTERVAL": "24h"
} }
} }

View File

@@ -64,22 +64,27 @@ module Ebooks::Boodoo
end end
end end
def jsonify(paths) def minify_tweets(tweets)
paths.each do |path| log "Minifying tweets..."
tweets.map do |tweet|
{id: tweet[:id], text: tweet[:text]}
end
end
def jsonify(path, new_name=nil)
name = File.basename(path).split('.')[0] name = File.basename(path).split('.')[0]
ext = path.split('.')[-1] ext = path.split('.')[-1]
new_path = name + ".json" new_name ||= name
new_path = new_name + ".json"
lines = [] lines = []
id = nil id = nil
if ext.downcase == "json"
log "Taking no action on JSON corpus at #{path}"
return
end
content = File.read(path, :encoding => 'utf-8') content = File.read(path, :encoding => 'utf-8')
if ext.downcase == "csv" #from twitter archive if ext.downcase == "json"
log "Minifying JSON corpus at #{path}"
lines = minify_tweets(JSON.parse(content, :symbolize_names=>true))
elsif ext.downcase == "csv" #from twitter archive
log "Reading CSV corpus from #{path}" log "Reading CSV corpus from #{path}"
content = CSV.parse(content) content = CSV.parse(content)
header = content.shift header = content.shift
@@ -96,11 +101,22 @@ module Ebooks::Boodoo
end end
end end
# BELOW IS FOR FILE-SYSTEM; NEED TO ALTER FOR CLOUDINARY/REQUEST?
File.open(new_path, 'w') do |f| File.open(new_path, 'w') do |f|
log "Writing #{lines.length} lines to #{new_path}" log "Writing #{lines.length} lines to #{new_path}"
f.write(JSON.pretty_generate(lines)) f.write(JSON.generate(lines))
end end
if has_cloud?
public_id = new_path
# log "Deleting JSON archive ~~~FROM THE CLOUD~~~"
# Cloudinary::Api.delete_resources(public_id, :resource_type=>:raw)
log "Uploading JSON archive ~~TO THE CLOUD~~"
res = Cloudinary::Uploader.upload(new_path, :resource_type=>:raw, :public_id=>public_id, :invalidate=>true)
log "Upload complete!"
res["url"]
else
log "Can't find ~~~THE CLOUD~~~, not uploading JSON archive."
nil
end end
end end
end end
@@ -120,9 +136,8 @@ class Ebooks::Boodoo::CloudArchive < Ebooks::Archive
@url = Cloudinary::Utils.cloudinary_url(@basename, :resource_type=>:raw) @url = Cloudinary::Utils.cloudinary_url(@basename, :resource_type=>:raw)
fetch! fetch!
parse! parse!
sync new_tweets = sync.class != IO
# save! # #sync automatically saves persist if new_tweets
persist
if @tweets.empty? if @tweets.empty?
log "New archive for @#{@username} at #{@url}" log "New archive for @#{@username} at #{@url}"
@@ -131,12 +146,21 @@ class Ebooks::Boodoo::CloudArchive < Ebooks::Archive
end end
end end
def minify!
@tweets = minify_tweets(@tweets)
end
def minify
minify_tweets(@tweets)
end
def persist(public_id=nil) def persist(public_id=nil)
public_id ||= @basename public_id ||= @basename
log "Deleting out-dated archive ~~~FROM THE CLOUD~~~" # log "Deleting out-dated archive ~~~FROM THE CLOUD~~~"
Cloudinary::Api.delete_resources(public_id, :resource_type=>:raw) # Cloudinary::Api.delete_resources(public_id, :resource_type=>:raw)
log "Uploading JSON archive ~~TO THE CLOUD~~" log "Uploading JSON archive ~~TO THE CLOUD~~"
res = Cloudinary::Uploader.upload(@path, :resource_type=>:raw, :public_id=>public_id, :invalidate=>true) res = Cloudinary::Uploader.upload(@path, :resource_type=>:raw, :public_id=>public_id, :invalidate=>true)
@url = res["url"]
log "Upload complete!" log "Upload complete!"
res res
end end
@@ -154,10 +178,11 @@ class Ebooks::Boodoo::CloudArchive < Ebooks::Archive
@tweets = parse(content) @tweets = parse(content)
end end
def save(path=nil) def save(path=nil, minify=true)
path ||= @path path ||= @path
output = minify ? JSON.generate(minify) : JSON.pretty_generate(@tweets)
File.open(path, 'w') do |f| File.open(path, 'w') do |f|
f.write(JSON.pretty_generate(@tweets)) f.write(output)
end end
end end
@@ -168,7 +193,7 @@ class Ebooks::Boodoo::CloudArchive < Ebooks::Archive
def fetch(url=nil) def fetch(url=nil)
url ||= @url url ||= @url
log "Fetching JSON archive ~~~FROM THE CLOUD~~~" log "Fetching JSON archive ~~~FROM THE CLOUD~~~"
content = Cloudinary::Downloader.download(url) content = Cloudinary::Downloader.download(url, :resource_type=>:raw)
if content.empty? if content.empty?
log "WARNING: JSON archive not found ~~~IN THE CLOUD~~~" log "WARNING: JSON archive not found ~~~IN THE CLOUD~~~"
nil nil
@@ -188,7 +213,7 @@ class Ebooks::Boodoo::CloudModel < Ebooks::Model
# @param content [String] # @param content [String]
# @return [Ebooks::Boodoo::CloudModel] # @return [Ebooks::Boodoo::CloudModel]
def self.parse(content) def self.parse(content)
model = Model.new model = CloudModel.new
model.instance_eval do model.instance_eval do
props = Marshal.load(content) props = Marshal.load(content)
@tokens = props[:tokens] @tokens = props[:tokens]
@@ -199,6 +224,12 @@ class Ebooks::Boodoo::CloudModel < Ebooks::Model
model model
end end
def self.from_json(content, is_file)
model = CloudModel.new
model.from_json(content, is_file)
model
end
def initialize(username, path=nil) def initialize(username, path=nil)
return Ebooks::Model.new unless has_cloud? return Ebooks::Model.new unless has_cloud?
@path = path || "corpus/#{username}.model" @path = path || "corpus/#{username}.model"
@@ -207,25 +238,32 @@ class Ebooks::Boodoo::CloudModel < Ebooks::Model
end end
super() super()
@basename = File.basename(@path) @basename = File.basename(@path)
@url = Cloudinary::Utils.cloudinary_url(@basename, :resource_type=>:raw)
end end
# Create a model from JSON string # Create a model from JSON string
# @content [String] Ebooks-style JSON twitter archive # @content [String/Array] Ebooks-style JSON twitter archive (pre-parsed)
# @return [Ebooks::Boodoo::CloudModel] # @return [Ebooks::Boodoo::CloudModel]
def from_json(content) def from_json(content, is_file=false)
log "Reading json corpus with length #{content.size}" content = File.read(content, :encoding=>'utf-8') if is_file
if content.respond_to?(:upcase)
lines = JSON.parse(content).map do |tweet| lines = JSON.parse(content).map do |tweet|
tweet['text'] tweet['text']
end end
else
lines = content
end
log "Reading json corpus with #{lines.size} lines"
consume_lines(lines) consume_lines(lines)
end end
def persist(public_id=nil) def persist(public_id=nil)
public_id ||= @basename public_id ||= @basename
log "Deleting old model ~~~FROM THE CLOUD~~~" # log "Deleting old model ~~~FROM THE CLOUD~~~"
Cloudinary::Api.delete_resources(@basename, :resource_type=>:raw) # Cloudinary::Api.delete_resources(@basename, :resource_type=>:raw)
log "Uploading bot model ~~TO THE CLOUD~~" log "Uploading bot model ~~TO THE CLOUD~~"
res = Cloudinary::Uploader.upload(@path, :resource_type=>:raw, :public_id=>public_id, :invalidate=>true) res = Cloudinary::Uploader.upload(@path, :resource_type=>:raw, :public_id=>public_id, :invalidate=>true)
@url = res["url"]
log "Upload complete!" log "Upload complete!"
res res
end end

View File

@@ -70,7 +70,7 @@ class BoodooBot
if can_run? if can_run?
log "This can run!" log "This can run!"
@archive = CloudArchive.new(original, archive_path, twitter) @archive = CloudArchive.new(original, archive_path, twitter)
@model = CloudModel.consume(@archive_path) @model = CloudModel.new(@original, @model_path).from_json(@archive_path, true)
else else
missing_fields.each {|missing| missing_fields.each {|missing|
log "Can't run without #{missing}" log "Can't run without #{missing}"

View File

@@ -17,4 +17,4 @@ TIMELINE_DELAY=10..600
TIMEOUT_SLEEP=5 TIMEOUT_SLEEP=5
MAX_ERROR_RETRIES=10 MAX_ERROR_RETRIES=10
UPDATE_FOLLOWS_INTERVAL=90m UPDATE_FOLLOWS_INTERVAL=90m
UPDATE_ARCHIVE_INTERVAL=8h UPDATE_ARCHIVE_INTERVAL=24h