mirror of
https://github.com/thewesker/iron_ebooks.git
synced 2025-12-20 04:11:12 -05:00
Don't tokenize empty tweets
This commit is contained in:
1
ebook.rb
1
ebook.rb
@@ -82,6 +82,7 @@ else
|
|||||||
tokenizer = Punkt::SentenceTokenizer.new(source_tweets.join(" ")) # init with corpus of all sentences
|
tokenizer = Punkt::SentenceTokenizer.new(source_tweets.join(" ")) # init with corpus of all sentences
|
||||||
|
|
||||||
source_tweets.each do |twt|
|
source_tweets.each do |twt|
|
||||||
|
next if twt.nil? || twt == ''
|
||||||
sentences = tokenizer.sentences_from_text(twt, :output => :sentences_text)
|
sentences = tokenizer.sentences_from_text(twt, :output => :sentences_text)
|
||||||
|
|
||||||
# sentences = text.split(/[.:;?!]/)
|
# sentences = text.split(/[.:;?!]/)
|
||||||
|
|||||||
Reference in New Issue
Block a user