def self.train_tc
profiles = []
languages = Dir.glob("textcat_ngrams/*.lm").collect {|l| l.gsub(/\.lm$/,'')}.sort
languages.each do |language|
ngram = {}
rang = 1
lang = File.open("#{language}.lm", "r")
lang.each_line do |line|
line = line.chomp
if line =~ /^[^0-9\s]+/o
ngram[line.chomp.split(/\t/).first] = rang
rang += 1
end
end
lang.close
p = LanguageDetector::Profile.new(:name => language.split('/').last.split('-').first)
p.ngrams = ngram
profiles.push p
end
puts 'saving model...'
filename = File.expand_path(File.join(File.dirname(__FILE__), "model-tc.yml"))
File.open(filename, 'w') {|f| YAML.dump(profiles, f)}
end