CloudArchive and CloudModel complete (enough)

- CloudModel is actually classed correctly - Minify JSON prior to uploading - Change archive update interval to 24h (30* ..20MB = 600/1000MB) - Store latest revision url for archive/model after persisting TODO: - Add INITIAL_CORPUS_FILE config var - Look for Cloudinary API call to get filesize w/o DLing - Logic for jsonify(INITIAL_CORPUS_FILE) v. using persisted archive - Add fetch/persist/etc methods to Ebooks::Archive/Model for fallback - Test it? Or just share it and pray.
2026-02-05 12:05:14 -05:00 · 2015-02-11 23:41:26 -05:00
parent 4151957c21
commit 8a26f1108f
4 changed files with 90 additions and 52 deletions
--- a/app.json
+++ b/app.json
@@ -60,6 +60,6 @@
    "TIMEOUT_SLEEP": "5",
    "MAX_ERROR_RETRIES": "10",
    "UPDATE_FOLLOWS_INTERVAL": "90m",
-    "UPDATE_ARCHIVE_INTERVAL": "8h"
+    "UPDATE_ARCHIVE_INTERVAL": "24h"
  }
 }
--- a/boodoo.rb
+++ b/boodoo.rb
@@ -64,43 +64,59 @@ module Ebooks::Boodoo
    end
  end
-  def jsonify(paths)
+  def minify_tweets(tweets)
-    paths.each do |path|
+    log "Minifying tweets..."
-      name = File.basename(path).split('.')[0]
+    tweets.map do |tweet|
-      ext = path.split('.')[-1]
+      {id: tweet[:id], text: tweet[:text]}
-      new_path = name + ".json"
+    end
-      lines = []
+  end
      id = nil
-      if ext.downcase == "json"
+  def jsonify(path, new_name=nil)
-        log "Taking no action on JSON corpus at #{path}"
+    name = File.basename(path).split('.')[0]
-        return
+    ext = path.split('.')[-1]
    new_name ||= name
    new_path = new_name + ".json"
    lines = []
    id = nil
    content = File.read(path, :encoding => 'utf-8')
    if ext.downcase == "json"
      log "Minifying JSON corpus at #{path}"
      lines = minify_tweets(JSON.parse(content, :symbolize_names=>true))
    elsif ext.downcase == "csv" #from twitter archive
      log "Reading CSV corpus from #{path}"
      content = CSV.parse(content)
      header = content.shift
      text_col = header.index('text')
      id_col = header.index('tweet_id')
      lines = content.map do |tweet|
        id = tweet[id_col].empty? ? 0 : tweet[id_col]
        {id: id, text: tweet[text_col]}
      end
-
+    else
-      content = File.read(path, :encoding => 'utf-8')
+      log "Reading plaintext corpus from #{path}"
-
+      lines = content.split("\n").map do |line|
-      if ext.downcase == "csv" #from twitter archive
+        {id: 0, text: line}
        log "Reading CSV corpus from #{path}"
        content = CSV.parse(content)
        header = content.shift
        text_col = header.index('text')
        id_col = header.index('tweet_id')
        lines = content.map do |tweet|
          id = tweet[id_col].empty? ? 0 : tweet[id_col]
          {id: id, text: tweet[text_col]}
        end
      else
        log "Reading plaintext corpus from #{path}"
        lines = content.split("\n").map do |line|
          {id: 0, text: line}
        end
      end
    end
-      # BELOW IS FOR FILE-SYSTEM; NEED TO ALTER FOR CLOUDINARY/REQUEST?
+    File.open(new_path, 'w') do |f|
-      File.open(new_path, 'w') do |f|
+      log "Writing #{lines.length} lines to #{new_path}"
-        log "Writing #{lines.length} lines to #{new_path}"
+      f.write(JSON.generate(lines))
-        f.write(JSON.pretty_generate(lines))
+    end
-      end
+
    if has_cloud?
      public_id = new_path
      # log "Deleting JSON archive ~~~FROM THE CLOUD~~~"
      # Cloudinary::Api.delete_resources(public_id, :resource_type=>:raw)
      log "Uploading JSON archive ~~TO THE CLOUD~~"
      res = Cloudinary::Uploader.upload(new_path, :resource_type=>:raw, :public_id=>public_id, :invalidate=>true)
      log "Upload complete!"
      res["url"]
    else
      log "Can't find ~~~THE CLOUD~~~, not uploading JSON archive."
      nil
    end
  end
 end
@@ -120,9 +136,8 @@ class Ebooks::Boodoo::CloudArchive < Ebooks::Archive
    @url = Cloudinary::Utils.cloudinary_url(@basename, :resource_type=>:raw)
    fetch!
    parse!
-    sync
+    new_tweets = sync.class != IO
-    # save! # #sync automatically saves
+    persist if new_tweets
    persist
    if @tweets.empty?
      log "New archive for @#{@username} at #{@url}"
@@ -131,12 +146,21 @@ class Ebooks::Boodoo::CloudArchive < Ebooks::Archive
    end
  end
  # Replaces @tweets in place with the result of minify_tweets(@tweets).
  # Use #minify for the non-mutating variant.
  def minify!
    @tweets = minify_tweets(@tweets)
  end
  # Returns the result of minify_tweets(@tweets) without reassigning @tweets.
  def minify
    minify_tweets(@tweets)
  end
  # Uploads the file at @path to Cloudinary as a raw resource and stores the
  # "url" field of the upload response in @url.
  # @param public_id [String, nil] public id to upload under; defaults to @basename
  # @return the response returned by Cloudinary::Uploader.upload
  def persist(public_id=nil)
    public_id ||= @basename
-    log "Deleting out-dated archive ~~~FROM THE CLOUD~~~"
+    # log "Deleting out-dated archive ~~~FROM THE CLOUD~~~"
-    Cloudinary::Api.delete_resources(public_id, :resource_type=>:raw)
+    # Cloudinary::Api.delete_resources(public_id, :resource_type=>:raw)
    # The explicit delete step above is disabled; the upload is issued with :invalidate=>true.
    log "Uploading JSON archive ~~TO THE CLOUD~~"
    res = Cloudinary::Uploader.upload(@path, :resource_type=>:raw, :public_id=>public_id, :invalidate=>true)
    @url = res["url"]
    log "Upload complete!"
    res
  end
@@ -154,10 +178,11 @@ class Ebooks::Boodoo::CloudArchive < Ebooks::Archive
    @tweets = parse(content)
  end
-  def save(path=nil)
+  def save(path=nil, minify=true)
    path ||= @path
    output = minify ? JSON.generate(minify) : JSON.pretty_generate(@tweets)
    File.open(path, 'w') do |f|
-      f.write(JSON.pretty_generate(@tweets))
+      f.write(output)
    end
  end
@@ -168,7 +193,7 @@ class Ebooks::Boodoo::CloudArchive < Ebooks::Archive
  def fetch(url=nil)
    url ||= @url
    log "Fetching JSON archive ~~~FROM THE CLOUD~~~"
-    content = Cloudinary::Downloader.download(url)
+    content = Cloudinary::Downloader.download(url, :resource_type=>:raw)
    if content.empty?
      log "WARNING: JSON archive not found ~~~IN THE CLOUD~~~"
      nil
@@ -188,7 +213,7 @@ class Ebooks::Boodoo::CloudModel < Ebooks::Model
  # @param content [String]
  # @return [Ebooks::Boodoo::CloudModel]
  def self.parse(content)
-    model = Model.new
+    model = CloudModel.new
    model.instance_eval do
      props = Marshal.load(content)
      @tokens = props[:tokens]
@@ -199,6 +224,12 @@ class Ebooks::Boodoo::CloudModel < Ebooks::Model
    model
  end
  # Class-level convenience: builds a CloudModel, feeds it the corpus through
  # the instance-level #from_json, and returns the model.
  # @param content passed straight to #from_json
  # @param is_file passed straight to #from_json
  # @return the newly created model
  # NOTE(review): CloudModel.new is called with no arguments, but this class's
  # initialize takes a required `username` — as written this looks like it
  # raises ArgumentError. Confirm which username/path should be supplied.
  def self.from_json(content, is_file)
    model = CloudModel.new
    model.from_json(content, is_file)
    model
  end
  def initialize(username, path=nil)
    return Ebooks::Model.new unless has_cloud?
    @path = path || "corpus/#{username}.model"
@@ -207,25 +238,32 @@ class Ebooks::Boodoo::CloudModel < Ebooks::Model
    end
    super()
    @basename = File.basename(@path)
    @url = Cloudinary::Utils.cloudinary_url(@basename, :resource_type=>:raw)
  end
  # Create a model from JSON string
  # When is_file is truthy, `content` is treated as a path and read as UTF-8 first.
  # NOTE(review): the value returned is whatever consume_lines returns — confirm
  # it matches the @return tag below, since callers may chain on the result.
-  # @content [String] Ebooks-style JSON twitter archive
+  # @content [String/Array] Ebooks-style JSON twitter archive (pre-parsed)
  # @return [Ebooks::Boodoo::CloudModel]
-  def from_json(content)
+  def from_json(content, is_file=false)
-    log "Reading json corpus with length #{content.size}"
+    content = File.read(content, :encoding=>'utf-8') if is_file
-    lines = JSON.parse(content).map do |tweet|
+    if content.respond_to?(:upcase)
-      tweet['text']
+      lines = JSON.parse(content).map do |tweet|
        tweet['text']
      end
    else
      # Anything that does not respond to :upcase is used as the list of lines as-is.
      lines = content
    end
    log "Reading json corpus with #{lines.size} lines"
    consume_lines(lines)
  end
  # Uploads the model file at @path to Cloudinary as a raw resource and stores
  # the "url" field of the upload response in @url.
  # @param public_id [String, nil] public id to upload under; defaults to @basename
  # @return the response returned by Cloudinary::Uploader.upload
  def persist(public_id=nil)
    public_id ||= @basename
-    log "Deleting old model ~~~FROM THE CLOUD~~~"
+    # log "Deleting old model ~~~FROM THE CLOUD~~~"
-    Cloudinary::Api.delete_resources(@basename, :resource_type=>:raw)
+    # Cloudinary::Api.delete_resources(@basename, :resource_type=>:raw)
    # The explicit delete step above is disabled; the upload is issued with :invalidate=>true.
    log "Uploading bot model ~~TO THE CLOUD~~"
    res = Cloudinary::Uploader.upload(@path, :resource_type=>:raw, :public_id=>public_id, :invalidate=>true)
    @url = res["url"]
    log "Upload complete!"
    res
  end
--- a/bots.rb
+++ b/bots.rb
@@ -70,7 +70,7 @@ class BoodooBot
    if can_run?
      log "This can run!"
      @archive = CloudArchive.new(original, archive_path, twitter)
-      @model = CloudModel.consume(@archive_path)
+      @model = CloudModel.new(@original, @model_path).from_json(@archive_path, true)
    else
      missing_fields.each {|missing|
        log "Can't run without #{missing}"
--- a/defaults.env
+++ b/defaults.env
@@ -17,4 +17,4 @@ TIMELINE_DELAY=10..600
 TIMEOUT_SLEEP=5
 MAX_ERROR_RETRIES=10
 UPDATE_FOLLOWS_INTERVAL=90m
-UPDATE_ARCHIVE_INTERVAL=8h
+UPDATE_ARCHIVE_INTERVAL=24h