Tweaking sentence tokenization, support for contractions

This commit is contained in:
Jacob Harris
2013-08-17 21:38:14 -04:00
parent 7061990d1e
commit b190084a02
5 changed files with 107 additions and 27 deletions

View File

@@ -1,5 +1,8 @@
#encoding: UTF-8
# Placeholder written in place of the apostrophe inside a known contraction, so
# the contraction is handled as a single word; generate_sentence turns it back
# into "'" before returning.
CONTRACTION_APOSTROPHE_SUBSTITUTE = 'qqq'.freeze

# Contractions whose apostrophe is protected by the placeholder above.
# Frozen so the shared list cannot be modified by accident at runtime.
CONTRACTIONS = %w[
  aren't can't couldn't didn't doesn't don't hadn't hasn't haven't
  he'd he'll he's I'd I'll I'm I've isn't it's let's
  mightn't mustn't shan't she'd she'll she's shouldn't
  that's there's they'd they'll they're they've
  we'd we're we've weren't
  what'll what're what's what've where's
  who'd who'll who're who's who've
  won't wouldn't you'd you'll you're you've
].freeze
class MarkovChainer
attr_reader :order
def initialize(order)
@@ -9,19 +12,22 @@ class MarkovChainer
end
# Splits a block of text into sentences and passes each one to add_sentence.
#
# Curly apostrophes (U+2019) are normalised to "'", blank-line paragraph
# breaks are turned into sentence terminators, and the text is then cut at
# every `.`, `:`, `;`, `?` or `!`. Pieces that contain only whitespace are
# skipped.
#
# @param text [String] source text; it is read but not modified
def add_text(text)
  # remove curly apostrophes (on a copy, so the caller's string is untouched)
  normalized = text.gsub(/\u2019/, "'")
  # make sure each paragraph ends with some sentence terminator
  normalized.gsub!(/[\r\n]+\p{Space}*[\r\n]+/m, ".")
  normalized << "."
  # The capture group makes String#split return the terminators too; without
  # it they are discarded and no sentence would ever be completed below.
  seps = /([.:;?!])/
  sentence = ""
  normalized.split(seps).each { |piece|
    if seps =~ piece
      # add_sentence expects the sentence with its terminator still attached
      add_sentence(sentence + piece) unless sentence.strip.empty?
      sentence = ""
    else
      sentence = piece
    end
  }
end
def generate_sentence
@@ -29,14 +35,32 @@ class MarkovChainer
return nil if res.nil?
loop {
unless nw = next_word_for(res[-order, order])
return res[0..-2].join(" ") + res.last
out = res[0..-2].join(" ") + (res.last || '.')
out.gsub!(CONTRACTION_APOSTROPHE_SUBSTITUTE, "'")
return out
end
res << nw
}
end
# Adds one sentence to the chain, separating its closing punctuation from its
# words before handing both to add_sentence_internal.
#
# @param sentence [String] raw sentence text. A final `.`, `:`, `;`, `?` or
#   `!` (optionally followed by whitespace) is used as the terminator; a
#   sentence without one is given '.'. The string is not modified.
def add_sentence(sentence)
  # \z anchors at the real end of the string. `$` also matches before an
  # embedded or trailing newline, in which case cutting the last character
  # would pick something other than the punctuation mark.
  terminator_at = sentence =~ /[.:;?!]\s*\z/
  if terminator_at
    add_sentence_internal(sentence[0, terminator_at], sentence[terminator_at, 1])
  else
    # add_sentence_internal edits its argument in place, so give it a copy
    add_sentence_internal(sentence.dup, '.')
  end
end
private
def add_sentence(str, terminator)
def add_sentence_internal(str, terminator)
str.gsub!(//, "'")
CONTRACTIONS.each do |c|
str.gsub!(/#{c}/i) {|m| m.gsub("'", CONTRACTION_APOSTROPHE_SUBSTITUTE)}
end
str.gsub!(/'s/, CONTRACTION_APOSTROPHE_SUBSTITUTE)
#puts str
words = str.scan(/[\p{Word}'\-]+/)
return unless words.size > order # ignore short sentences
words << terminator