Mirror of https://github.com/thewesker/twitter_ebooks.git
Commit: 2.2.5 - encoding: utf-8
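The whole change adds Ruby's "# encoding: utf-8" magic comment to the top of each affected file. On Ruby 1.9 a source file is parsed as US-ASCII unless such a comment is present, so any multibyte literal (easy to hit when working with tweet text) raises an "invalid multibyte char" error; Ruby 2.0 and later already default source files to UTF-8, where the comment is harmless. A standalone illustration, unrelated to the files below:

    # encoding: utf-8
    # Without the magic comment above, Ruby 1.9 parses this file as US-ASCII
    # and fails on the multibyte string literal below with
    # "invalid multibyte char (US-ASCII)"; Ruby 2.0+ defaults to UTF-8.
    greeting = "héllo wörld"
    puts greeting.encoding  # => UTF-8
    puts greeting.length    # => 11 (characters, not bytes)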
@@ -1,4 +1,5 @@
 #!/usr/bin/env ruby
+# encoding: utf-8
 
 require 'twitter_ebooks'
 
@@ -1,4 +1,5 @@
 #!/usr/bin/env ruby
+# encoding: utf-8
 require 'twitter'
 require 'tweetstream'
 require 'rufus/scheduler'
@@ -1,3 +1,5 @@
+# encoding: utf-8
+
 module Ebooks
   # This generator uses data identical to the markov model, but
   # instead of making a chain by looking up bigrams it uses the
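The comment above is cut off by the diff context, but the hunks that follow show the idea: instead of walking a chain of bigrams, the generator remembers, for each token, the [sentence_index, position] pairs where it occurs, and builds variants by splicing the suffix of one stored sentence onto the prefix of the current one. A rough sketch of that kind of index, with all names invented here for illustration:

    # Hypothetical sketch (build_index and corpus are not from this repo):
    # for every token, record the [sentence_index, position] pairs where it
    # occurs, so a matching suffix can be spliced in later.
    def build_index(corpus)
      unigrams = Hash.new { |h, k| h[k] = [] }
      corpus.each_with_index do |tokens, sentence_idx|
        tokens.each_with_index do |token, pos|
          unigrams[token] << [sentence_idx, pos]
        end
      end
      unigrams
    end

    index = build_index([%w[the cat sat], %w[the dog sat]])
    p index["sat"]  # => [[0, 2], [1, 2]]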
@@ -50,7 +52,7 @@ module Ebooks
         tokens.each_with_index do |token, i|
           next_token = tokens[i+1]
           break if next_token.nil?
 
           alternatives = (n == :unigrams) ? @unigrams[next_token] : @bigrams[token][next_token]
           # Filter out suffixes from previous sentences
           alternatives.reject! { |a| a[1] == INTERIM || used.include?(a[0]) }
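Judging by how the entries are indexed further down (alt[0] and alt[1]), each alternative in @unigrams[next_token] or @bigrams[token][next_token] is a [sentence_index, token_position] pair, and the reject! drops candidates that point at a boundary sentinel (INTERIM) or at a sentence already used in this pass. A self-contained illustration with made-up data and an assumed sentinel value:

    INTERIM = -1   # assumed sentinel marking an end-of-sentence position
    used = [3]     # sentence indices already spliced in during this pass

    # Each alternative is a [sentence_index, token_position] pair.
    alternatives = [[3, 4], [7, INTERIM], [9, 2]]

    # Same predicate as the hunk above: drop boundary markers and
    # sentences we have already drawn a suffix from.
    alternatives.reject! { |a| a[1] == INTERIM || used.include?(a[0]) }
    p alternatives  # => [[9, 2]]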
@@ -66,7 +68,7 @@ module Ebooks
             verbatim << @sentences[alt[0]]
             suffix = @sentences[alt[0]][alt[1]..-1]
             potential = tokens[0..start+1] + suffix
 
             # Ensure we're not just rebuilding some segment of another sentence
             unless verbatim.find { |v| NLP.subseq?(v, potential) || NLP.subseq?(potential, v) }
               used << alt[0]
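NLP.subseq? itself is outside this diff; from the way it is called it must report whether one token array occurs as a contiguous slice of the other, so that a spliced candidate which merely reproduces part of a sentence already in verbatim gets discarded. A plausible stand-in, not the library's actual implementation:

    # Assumed behaviour of a contiguous-subsequence check matching the call
    # above: true if token array b occurs as a contiguous slice of a.
    def subseq?(a, b)
      return false if b.empty? || b.length > a.length
      (0..a.length - b.length).any? { |i| a[i, b.length] == b }
    end

    p subseq?(%w[the cat sat down], %w[cat sat])   # => true
    p subseq?(%w[the cat sat down], %w[cat down])  # => false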