11. Python参考コード
[kytea.py]
import sys
import commands
if __name__ == "__main__":
file = sys.argv[1]
model = sys.argv[2]
#read file
raw = open(file).read()
print "----- origin -----"
print raw #元データの表示
print "----- morphology -----" #split word
c = 'kytea -model ' + model + ' < ' + file
result = commands.getstatusoutput(c) #kyteaのコマンドを実行
for r in result[1].split(): #わかち書き結果の表示
print r
11
18. NLTK実行例(数え上げ)
[freq.py]
#ワードをn-gramモデルに基づいて解析+出現頻度の数え上げ
def ngrams(words, ngram, limit):
ngrams = nltk.ngrams(words, ngram) #ワード群をn-gram化
fd = nltk.FreqDist(ngrams) #数え上げ
result = {}
for f in fd: #FreqDist構造を普通のdictionaryに変換(非クール)
r = ""
for n in range(0, ngram):
if n > 0:
r += " "
r += f[n]
result[r] = fd[f]
c = 0 #出現頻度多い順にソートして標準出力
for k,v in sorted(result.items(), key=lambda x:x[1], reverse=True):
c += 1
if limit > 0:
if c > limit:
break
print k + "t" + str(result[k])
18
25. Map参考コード
[map.py]
if __name__ == "__main__":
for line in sys.stdin: #標準入力から1行づつ読み込み
line = line.strip()
words = parse(line)
fd = nltk.FreqDist(words)
for f in fd:
print f + “¥t” fd[f] #標準出力にKey/Valueのペアを出力
25
26. Reduce参考コード
[reduce.py]
if __name__ == "__main__":
word2count = {}
for line in sys.stdin: #標準入力からKey/Valueを取得
line = line.strip()
word, count = line.split('t', 1)
try:
count = int(count)
word2count[word] = word2count.get(word, 0) + count
exceptValueError:
pass
sorted_word2count = sorted(word2count.items(), key=itemgetter(0))
for word, count in sorted_word2count:
print '%st%s'% (word, count) #標準出力にKey/Valueのペアを出力
26