Mirror of https://github.com/thewesker/twitter_ebooks.git (synced 2025-12-23 13:51:09 -05:00)
On second thought, we can't use a cache system
Simply because the corpuses are too darn big to keep around
@@ -4,33 +4,14 @@
 require 'json'
 require 'set'
 require 'digest/md5'
-require 'fileutils'
 require 'csv'
 
 module Ebooks
   class Model
     attr_accessor :hash, :tokens, :sentences, :mentions, :keywords
 
-    # Consume a corpus file to create a model
-    # @param corpus_path Path to a json, text or csv file to consume
-    # @param cache Optional path to a directory to store cached models
-    def self.consume(corpus_path, cache: nil)
-      if cache
-        FileUtils::mkdir_p cache
-
-        cache_path = File.join(cache, Digest::MD5.file(corpus_path).to_s)
-        if File.exists?(cache_path)
-          log "Reading model from cache at #{cache_path}"
-          return Model.load(cache_path)
-        end
-      end
-
-      model = Model.new.consume(corpus_path)
-
-      if cache
-        log "Caching model at #{cache_path}"
-        model.save(cache_path)
-      end
+    def self.consume(txtpath)
+      Model.new.consume(txtpath)
     end
 
     def self.consume_all(paths)
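For reference, the approach being removed keyed cached models on an MD5 digest of the corpus file, so an unchanged corpus would map to a previously saved model. Below is a minimal standalone Ruby sketch of that scheme; `cached_model_path` and the `model_cache` directory are illustrative names for this sketch, not identifiers from twitter_ebooks.

require 'digest/md5'
require 'fileutils'
require 'tempfile'

# Sketch of the cache-key scheme this commit removes: an unchanged corpus file
# hashes to the same MD5 digest, so a previously saved model could be reused.
# `cached_model_path` is a hypothetical helper, not part of the library.
def cached_model_path(corpus_path, cache_dir = 'model_cache')
  FileUtils.mkdir_p(cache_dir)
  File.join(cache_dir, Digest::MD5.file(corpus_path).to_s)
end

# Stand-in corpus file so the sketch runs on its own.
corpus = Tempfile.new('corpus')
corpus.write("just enough text to stand in for a corpus\n")
corpus.close

path = cached_model_path(corpus.path)
if File.exist?(path)
  puts "cache hit: would load the model from #{path}"
else
  puts "cache miss: would consume the corpus and save the model to #{path}"
end

As the commit message explains, each cached model is derived from the full corpus, so keeping one per corpus digest takes up disk space on the same order as the corpora themselves, and the code reverts to the plain Model.new.consume(txtpath) path instead.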