mirror of
https://github.com/thewesker/iron_ebooks.git
synced 2025-12-20 04:11:12 -05:00
Tweaking sentence tokenization, support for contractions
This commit is contained in:
54
markov.rb
54
markov.rb
@@ -1,5 +1,8 @@
|
||||
#encoding: UTF-8
|
||||
|
||||
# Stand-in for the apostrophe inside contractions while sentences are stored
# in the chain; generate_sentence turns it back into "'" in its output.
CONTRACTION_APOSTROPHE_SUBSTITUTE = 'qqq'.freeze

# Contractions whose apostrophe is swapped for the substitute above
# (matched case-insensitively). Frozen so the shared list cannot be
# modified by accident.
CONTRACTIONS = %w(aren't can't couldn't didn't doesn't don't hadn't hasn't haven't he'd he'll he's I'd I'll I'm I've isn't it's let's mightn't mustn't shan't she'd she'll she's shouldn't that's there's they'd they'll they're they've we'd we're we've weren't what'll what're what's what've where's who'd who'll who're who's who've won't wouldn't you'd you'll you're you've).freeze
|
||||
|
||||
class MarkovChainer
|
||||
attr_reader :order
|
||||
def initialize(order)
|
||||
@@ -9,19 +12,22 @@ class MarkovChainer
|
||||
end
|
||||
|
||||
# Splits +text+ into sentences and adds each one to the chain.
#
# Curly apostrophes are normalised to "'", every paragraph break (line
# breaks separated only by whitespace) is turned into a ".", and the result
# is cut at the sentence terminators . : ; ? !
#
# The argument itself is left untouched: the clean-up happens on a copy, so
# frozen strings are accepted and the caller's text does not grow a trailing
# ".".
#
# @param text [String] raw text to learn from
def add_text(text)
  # remove curly apostrophes
  cleaned = text.gsub(/’/, "'")

  # make sure each paragraph ends with some sentence terminator
  cleaned = cleaned.gsub(/[\r\n]+\p{Space}*[\r\n]+/m, ".")
  cleaned << "."

  # The capture group matters: String#split only keeps the separators in its
  # result when the pattern captures them. Without it no piece ever matches
  # +seps+ and nothing is added to the chain.
  seps = /([.:;?!])/
  sentence = ""
  cleaned.split(seps).each do |piece|
    if seps =~ piece
      # +piece+ is the terminator that closes the text collected so far.
      # Blank stretches (e.g. between "..") carry no words, so skip them.
      add_sentence_internal(sentence, piece) unless sentence.strip.empty?
      sentence = ""
    else
      sentence = piece
    end
  end
end
|
||||
|
||||
def generate_sentence
|
||||
@@ -29,14 +35,32 @@ class MarkovChainer
|
||||
return nil if res.nil?
|
||||
loop {
|
||||
unless nw = next_word_for(res[-order, order])
|
||||
return res[0..-2].join(" ") + res.last
|
||||
out = res[0..-2].join(" ") + (res.last || '.')
|
||||
out.gsub!(CONTRACTION_APOSTROPHE_SUBSTITUTE, "'")
|
||||
return out
|
||||
end
|
||||
res << nw
|
||||
}
|
||||
end
|
||||
|
||||
# Adds a single sentence to the chain.
#
# When the sentence ends in one of . : ; ? ! (optionally followed by
# whitespace) that character is split off and passed on as the terminator;
# otherwise "." is used.
#
# The caller's string is never handed to the internal helper directly,
# because that helper rewrites its argument in place.
#
# @param sentence [String] one sentence, with or without its terminator
def add_sentence(sentence)
  # \z anchors at the real end of the string. `$` would also match just
  # before an embedded newline and make the code chop the wrong character.
  terminated = /[.:;?!]\s*\z/.match(sentence)
  if terminated
    body = sentence[0, terminated.begin(0)]
    add_sentence_internal(body, terminated[0][0, 1])
  else
    add_sentence_internal(sentence.dup, '.')
  end
end
|
||||
|
||||
private
|
||||
def add_sentence(str, terminator)
|
||||
def add_sentence_internal(str, terminator)
|
||||
str.gsub!(/’/, "'")
|
||||
|
||||
CONTRACTIONS.each do |c|
|
||||
str.gsub!(/#{c}/i) {|m| m.gsub("'", CONTRACTION_APOSTROPHE_SUBSTITUTE)}
|
||||
end
|
||||
str.gsub!(/'s/, CONTRACTION_APOSTROPHE_SUBSTITUTE)
|
||||
|
||||
#puts str
|
||||
words = str.scan(/[\p{Word}'’\-]+/)
|
||||
return unless words.size > order # ignore short sentences
|
||||
words << terminator
|
||||
|
||||
Reference in New Issue
Block a user