Cleaning

2026-02-04 19:45:21 -05:00 · 2013-11-14 08:30:45 -08:00
parent 00f0228dd4
commit e4209f79e4
1 changed files with 6 additions and 1 deletions
--- a/lib/twitter_ebooks/suffix.rb
+++ b/lib/twitter_ebooks/suffix.rb
@@ -1,4 +1,8 @@
 module Ebooks
+  # This generator uses data identical to the Markov model, but
+  # instead of making a chain by looking up bigrams, it uses the
+  # positions to randomly replace suffixes in one sentence with
+  # matching suffixes in another.
  class SuffixGenerator
    def self.build(sentences)
      SuffixGenerator.new(sentences)
@@ -48,6 +52,7 @@ module Ebooks
          break if next_token.nil?
          
          alternatives = (n == :unigrams) ? @unigrams[next_token] : @bigrams[token][next_token]
+          # Filter out suffixes from previous sentences
          alternatives.reject! { |a| a[1] == INTERIM || used.include?(a[0]) }
          varsites[i] = alternatives unless alternatives.empty?
        end
@@ -62,6 +67,7 @@ module Ebooks
            suffix = @sentences[alt[0]][alt[1]..-1]
            potential = tokens[0..start+1] + suffix
            
+            # Ensure we're not just rebuilding some segment of another sentence
            unless verbatim.find { |v| NLP.subseq?(v, potential) || NLP.subseq?(potential, v) }
              used << alt[0]
              variant = potential
@@ -75,7 +81,6 @@ module Ebooks
        tokens = variant if variant
      end

-
      tokens
    end
  end