mirror of
https://github.com/thewesker/ebooks_example.git
synced 2025-12-22 05:11:08 -05:00
CloudArchive and CloudModel complete (enough)
- CloudModel is actually classed correctly
- Minify JSON prior to uploading
- Change archive update interval to 24h (30* ..20MB = 600/1000MB)
- Store latest revision url for archive/model after persisting

TODO:
- Add INITIAL_CORPUS_FILE config var
- Look for Cloudinary API call to get filesize w/o DLing
- Logic for jsonify(INITIAL_CORPUS_FILE) vs. using persisted archive
- Add fetch/persist/etc. methods to Ebooks::Archive/Model for fallback
- Test it? Or just share it and pray.
This commit is contained in:
2
app.json
2
app.json
@@ -60,6 +60,6 @@
|
|||||||
"TIMEOUT_SLEEP": "5",
|
"TIMEOUT_SLEEP": "5",
|
||||||
"MAX_ERROR_RETRIES": "10",
|
"MAX_ERROR_RETRIES": "10",
|
||||||
"UPDATE_FOLLOWS_INTERVAL": "90m",
|
"UPDATE_FOLLOWS_INTERVAL": "90m",
|
||||||
"UPDATE_ARCHIVE_INTERVAL": "8h"
|
"UPDATE_ARCHIVE_INTERVAL": "24h"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
136
boodoo.rb
136
boodoo.rb
@@ -64,43 +64,59 @@ module Ebooks::Boodoo
|
|||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
def jsonify(paths)
|
def minify_tweets(tweets)
|
||||||
paths.each do |path|
|
log "Minifying tweets..."
|
||||||
name = File.basename(path).split('.')[0]
|
tweets.map do |tweet|
|
||||||
ext = path.split('.')[-1]
|
{id: tweet[:id], text: tweet[:text]}
|
||||||
new_path = name + ".json"
|
end
|
||||||
lines = []
|
end
|
||||||
id = nil
|
|
||||||
|
|
||||||
if ext.downcase == "json"
|
def jsonify(path, new_name=nil)
|
||||||
log "Taking no action on JSON corpus at #{path}"
|
name = File.basename(path).split('.')[0]
|
||||||
return
|
ext = path.split('.')[-1]
|
||||||
|
new_name ||= name
|
||||||
|
new_path = new_name + ".json"
|
||||||
|
lines = []
|
||||||
|
id = nil
|
||||||
|
|
||||||
|
content = File.read(path, :encoding => 'utf-8')
|
||||||
|
|
||||||
|
if ext.downcase == "json"
|
||||||
|
log "Minifying JSON corpus at #{path}"
|
||||||
|
lines = minify_tweets(JSON.parse(content, :symbolize_names=>true))
|
||||||
|
elsif ext.downcase == "csv" #from twitter archive
|
||||||
|
log "Reading CSV corpus from #{path}"
|
||||||
|
content = CSV.parse(content)
|
||||||
|
header = content.shift
|
||||||
|
text_col = header.index('text')
|
||||||
|
id_col = header.index('tweet_id')
|
||||||
|
lines = content.map do |tweet|
|
||||||
|
id = tweet[id_col].empty? ? 0 : tweet[id_col]
|
||||||
|
{id: id, text: tweet[text_col]}
|
||||||
end
|
end
|
||||||
|
else
|
||||||
content = File.read(path, :encoding => 'utf-8')
|
log "Reading plaintext corpus from #{path}"
|
||||||
|
lines = content.split("\n").map do |line|
|
||||||
if ext.downcase == "csv" #from twitter archive
|
{id: 0, text: line}
|
||||||
log "Reading CSV corpus from #{path}"
|
|
||||||
content = CSV.parse(content)
|
|
||||||
header = content.shift
|
|
||||||
text_col = header.index('text')
|
|
||||||
id_col = header.index('tweet_id')
|
|
||||||
lines = content.map do |tweet|
|
|
||||||
id = tweet[id_col].empty? ? 0 : tweet[id_col]
|
|
||||||
{id: id, text: tweet[text_col]}
|
|
||||||
end
|
|
||||||
else
|
|
||||||
log "Reading plaintext corpus from #{path}"
|
|
||||||
lines = content.split("\n").map do |line|
|
|
||||||
{id: 0, text: line}
|
|
||||||
end
|
|
||||||
end
|
end
|
||||||
|
end
|
||||||
|
|
||||||
# BELOW IS FOR FILE-SYSTEM; NEED TO ALTER FOR CLOUDINARY/REQUEST?
|
File.open(new_path, 'w') do |f|
|
||||||
File.open(new_path, 'w') do |f|
|
log "Writing #{lines.length} lines to #{new_path}"
|
||||||
log "Writing #{lines.length} lines to #{new_path}"
|
f.write(JSON.generate(lines))
|
||||||
f.write(JSON.pretty_generate(lines))
|
end
|
||||||
end
|
|
||||||
|
if has_cloud?
|
||||||
|
public_id = new_path
|
||||||
|
# log "Deleting JSON archive ~~~FROM THE CLOUD~~~"
|
||||||
|
# Cloudinary::Api.delete_resources(public_id, :resource_type=>:raw)
|
||||||
|
log "Uploading JSON archive ~~TO THE CLOUD~~"
|
||||||
|
res = Cloudinary::Uploader.upload(new_path, :resource_type=>:raw, :public_id=>public_id, :invalidate=>true)
|
||||||
|
log "Upload complete!"
|
||||||
|
res["url"]
|
||||||
|
else
|
||||||
|
log "Can't find ~~~THE CLOUD~~~, not uploading JSON archive."
|
||||||
|
nil
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
@@ -120,9 +136,8 @@ class Ebooks::Boodoo::CloudArchive < Ebooks::Archive
|
|||||||
@url = Cloudinary::Utils.cloudinary_url(@basename, :resource_type=>:raw)
|
@url = Cloudinary::Utils.cloudinary_url(@basename, :resource_type=>:raw)
|
||||||
fetch!
|
fetch!
|
||||||
parse!
|
parse!
|
||||||
sync
|
new_tweets = sync.class != IO
|
||||||
# save! # #sync automatically saves
|
persist if new_tweets
|
||||||
persist
|
|
||||||
|
|
||||||
if @tweets.empty?
|
if @tweets.empty?
|
||||||
log "New archive for @#{@username} at #{@url}"
|
log "New archive for @#{@username} at #{@url}"
|
||||||
@@ -131,12 +146,21 @@ class Ebooks::Boodoo::CloudArchive < Ebooks::Archive
|
|||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
def minify!
|
||||||
|
@tweets = minify_tweets(@tweets)
|
||||||
|
end
|
||||||
|
|
||||||
|
def minify
|
||||||
|
minify_tweets(@tweets)
|
||||||
|
end
|
||||||
|
|
||||||
def persist(public_id=nil)
|
def persist(public_id=nil)
|
||||||
public_id ||= @basename
|
public_id ||= @basename
|
||||||
log "Deleting out-dated archive ~~~FROM THE CLOUD~~~"
|
# log "Deleting out-dated archive ~~~FROM THE CLOUD~~~"
|
||||||
Cloudinary::Api.delete_resources(public_id, :resource_type=>:raw)
|
# Cloudinary::Api.delete_resources(public_id, :resource_type=>:raw)
|
||||||
log "Uploading JSON archive ~~TO THE CLOUD~~"
|
log "Uploading JSON archive ~~TO THE CLOUD~~"
|
||||||
res = Cloudinary::Uploader.upload(@path, :resource_type=>:raw, :public_id=>public_id, :invalidate=>true)
|
res = Cloudinary::Uploader.upload(@path, :resource_type=>:raw, :public_id=>public_id, :invalidate=>true)
|
||||||
|
@url = res["url"]
|
||||||
log "Upload complete!"
|
log "Upload complete!"
|
||||||
res
|
res
|
||||||
end
|
end
|
||||||
@@ -154,10 +178,11 @@ class Ebooks::Boodoo::CloudArchive < Ebooks::Archive
|
|||||||
@tweets = parse(content)
|
@tweets = parse(content)
|
||||||
end
|
end
|
||||||
|
|
||||||
def save(path=nil)
|
def save(path=nil, minify=true)
|
||||||
path ||= @path
|
path ||= @path
|
||||||
|
output = minify ? JSON.generate(minify) : JSON.pretty_generate(@tweets)
|
||||||
File.open(path, 'w') do |f|
|
File.open(path, 'w') do |f|
|
||||||
f.write(JSON.pretty_generate(@tweets))
|
f.write(output)
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
@@ -168,7 +193,7 @@ class Ebooks::Boodoo::CloudArchive < Ebooks::Archive
|
|||||||
def fetch(url=nil)
|
def fetch(url=nil)
|
||||||
url ||= @url
|
url ||= @url
|
||||||
log "Fetching JSON archive ~~~FROM THE CLOUD~~~"
|
log "Fetching JSON archive ~~~FROM THE CLOUD~~~"
|
||||||
content = Cloudinary::Downloader.download(url)
|
content = Cloudinary::Downloader.download(url, :resource_type=>:raw)
|
||||||
if content.empty?
|
if content.empty?
|
||||||
log "WARNING: JSON archive not found ~~~IN THE CLOUD~~~"
|
log "WARNING: JSON archive not found ~~~IN THE CLOUD~~~"
|
||||||
nil
|
nil
|
||||||
@@ -188,7 +213,7 @@ class Ebooks::Boodoo::CloudModel < Ebooks::Model
|
|||||||
# @param content [String]
|
# @param content [String]
|
||||||
# @return [Ebooks::Boodoo::CloudModel]
|
# @return [Ebooks::Boodoo::CloudModel]
|
||||||
def self.parse(content)
|
def self.parse(content)
|
||||||
model = Model.new
|
model = CloudModel.new
|
||||||
model.instance_eval do
|
model.instance_eval do
|
||||||
props = Marshal.load(content)
|
props = Marshal.load(content)
|
||||||
@tokens = props[:tokens]
|
@tokens = props[:tokens]
|
||||||
@@ -199,6 +224,12 @@ class Ebooks::Boodoo::CloudModel < Ebooks::Model
|
|||||||
model
|
model
|
||||||
end
|
end
|
||||||
|
|
||||||
|
def self.from_json(content, is_file)
|
||||||
|
model = CloudModel.new
|
||||||
|
model.from_json(content, is_file)
|
||||||
|
model
|
||||||
|
end
|
||||||
|
|
||||||
def initialize(username, path=nil)
|
def initialize(username, path=nil)
|
||||||
return Ebooks::Model.new unless has_cloud?
|
return Ebooks::Model.new unless has_cloud?
|
||||||
@path = path || "corpus/#{username}.model"
|
@path = path || "corpus/#{username}.model"
|
||||||
@@ -207,25 +238,32 @@ class Ebooks::Boodoo::CloudModel < Ebooks::Model
|
|||||||
end
|
end
|
||||||
super()
|
super()
|
||||||
@basename = File.basename(@path)
|
@basename = File.basename(@path)
|
||||||
|
@url = Cloudinary::Utils.cloudinary_url(@basename, :resource_type=>:raw)
|
||||||
end
|
end
|
||||||
|
|
||||||
# Create a model from JSON string
|
# Create a model from JSON string
|
||||||
# @content [String] Ebooks-style JSON twitter archive
|
# @content [String/Array] Ebooks-style JSON twitter archive (pre-parsed)
|
||||||
# @return [Ebooks::Boodoo::CloudModel]
|
# @return [Ebooks::Boodoo::CloudModel]
|
||||||
def from_json(content)
|
def from_json(content, is_file=false)
|
||||||
log "Reading json corpus with length #{content.size}"
|
content = File.read(content, :encoding=>'utf-8') if is_file
|
||||||
lines = JSON.parse(content).map do |tweet|
|
if content.respond_to?(:upcase)
|
||||||
tweet['text']
|
lines = JSON.parse(content).map do |tweet|
|
||||||
|
tweet['text']
|
||||||
|
end
|
||||||
|
else
|
||||||
|
lines = content
|
||||||
end
|
end
|
||||||
|
log "Reading json corpus with #{lines.size} lines"
|
||||||
consume_lines(lines)
|
consume_lines(lines)
|
||||||
end
|
end
|
||||||
|
|
||||||
def persist(public_id=nil)
|
def persist(public_id=nil)
|
||||||
public_id ||= @basename
|
public_id ||= @basename
|
||||||
log "Deleting old model ~~~FROM THE CLOUD~~~"
|
# log "Deleting old model ~~~FROM THE CLOUD~~~"
|
||||||
Cloudinary::Api.delete_resources(@basename, :resource_type=>:raw)
|
# Cloudinary::Api.delete_resources(@basename, :resource_type=>:raw)
|
||||||
log "Uploading bot model ~~TO THE CLOUD~~"
|
log "Uploading bot model ~~TO THE CLOUD~~"
|
||||||
res = Cloudinary::Uploader.upload(@path, :resource_type=>:raw, :public_id=>public_id, :invalidate=>true)
|
res = Cloudinary::Uploader.upload(@path, :resource_type=>:raw, :public_id=>public_id, :invalidate=>true)
|
||||||
|
@url = res["url"]
|
||||||
log "Upload complete!"
|
log "Upload complete!"
|
||||||
res
|
res
|
||||||
end
|
end
|
||||||
|
|||||||
2
bots.rb
2
bots.rb
@@ -70,7 +70,7 @@ class BoodooBot
|
|||||||
if can_run?
|
if can_run?
|
||||||
log "This can run!"
|
log "This can run!"
|
||||||
@archive = CloudArchive.new(original, archive_path, twitter)
|
@archive = CloudArchive.new(original, archive_path, twitter)
|
||||||
@model = CloudModel.consume(@archive_path)
|
@model = CloudModel.new(@original, @model_path).from_json(@archive_path, true)
|
||||||
else
|
else
|
||||||
missing_fields.each {|missing|
|
missing_fields.each {|missing|
|
||||||
log "Can't run without #{missing}"
|
log "Can't run without #{missing}"
|
||||||
|
|||||||
@@ -17,4 +17,4 @@ TIMELINE_DELAY=10..600
|
|||||||
TIMEOUT_SLEEP=5
|
TIMEOUT_SLEEP=5
|
||||||
MAX_ERROR_RETRIES=10
|
MAX_ERROR_RETRIES=10
|
||||||
UPDATE_FOLLOWS_INTERVAL=90m
|
UPDATE_FOLLOWS_INTERVAL=90m
|
||||||
UPDATE_ARCHIVE_INTERVAL=8h
|
UPDATE_ARCHIVE_INTERVAL=24h
|
||||||
|
|||||||
Reference in New Issue
Block a user