怎样计算两个文本的Rougel,Bleu评价指标
怎样计算两个文本的Rougel评价指标一、问题描述想要计算两句话的Rougel值二 、程序实现1. rougel.py#!/usr/bin/env python## File Name : rouge.py## Description : Computes ROUGE-L metric as described by Lin and Hovey (2004)## Creation Date : 2
·
怎样计算两个文本的Rougel评价指标
一、问题描述
想要计算两句话的Rougel值
二 、程序实现
1. rougel.py
#!/usr/bin/env python
#
# File Name : rouge.py
#
# Description : Computes ROUGE-L metric as described by Lin and Hovey (2004)
#
# Creation Date : 2015-01-07 06:03
# Author : Ramakrishna Vedantam <vrama91@vt.edu>
import numpy as np
import pdb
def my_lcs(string, sub):
"""
Calculates longest common subsequence for a pair of tokenized strings
:param string : list of str : tokens from a string split using whitespace
:param sub : list of str : shorter string, also split using whitespace
:returns: length (list of int): length of the longest common subsequence between the two strings
Note: my_lcs only gives length of the longest common subsequence, not the actual LCS
"""
if(len(string)< len(sub)):
sub, string = string, sub
lengths = [[0 for i in range(0,len(sub)+1)] for j in range(0,len(string)+1)]
for j in range(1,len(sub)+1):
for i in range(1,len(string)+1):
if(string[i-1] == sub[j-1]):
lengths[i][j] = lengths[i-1][j-1] + 1
else:
lengths[i][j] = max(lengths[i-1][j] , lengths[i][j-1])
return lengths[len(string)][len(sub)]
class Rouge():
'''
Class for computing ROUGE-L score for a set of candidate sentences for the MS COCO test set
'''
def __init__(self):
# vrama91: updated the value below based on discussion with Hovey
self.beta = 1.2
def calc_score(self, candidate, refs):
"""
Compute ROUGE-L score given one candidate and references for an image
:param candidate: str : candidate sentence to be evaluated
:param refs: list of str : COCO reference sentences for the particular image to be evaluated
:returns score: int (ROUGE-L score for the candidate evaluated against references)
"""
assert(len(candidate)==1)
assert(len(refs)>0)
prec = []
rec = []
# split into tokens
token_c = candidate[0].split(" ")
for reference in refs:
# split into tokens
token_r = reference.split(" ")
# compute the longest common subsequence
lcs = my_lcs(token_r, token_c)
prec.append(lcs/float(len(token_c)))
rec.append(lcs/float(len(token_r)))
prec_max = max(prec)
rec_max = max(rec)
if(prec_max!=0 and rec_max !=0):
score = ((1 + self.beta**2)*prec_max*rec_max)/float(rec_max + self.beta**2*prec_max)
else:
score = 0.0
return score
def compute_score(self, gts, res):
"""
Computes Rouge-L score given a set of reference and candidate sentences for the dataset
Invoked by evaluate_captions.py
:param hypo_for_image: dict : candidate / test sentences with "image name" key and "tokenized sentences" as values
:param ref_for_image: dict : reference MS-COCO sentences with "image name" key and "tokenized sentences" as values
:returns: average_score: float (mean ROUGE-L score computed by averaging scores for all the images)
"""
assert(gts.keys() == res.keys())
imgIds = gts.keys()
score = []
for id in imgIds:
hypo = res[id]
ref = gts[id]
score.append(self.calc_score(hypo, ref))
# Sanity check.
assert(type(hypo) is list)
assert(len(hypo) == 1)
assert(type(ref) is list)
assert(len(ref) > 0)
average_score = np.mean(np.array(score))
# return average_score, np.array(score) # average_score represents all rougel, np.array(score) represent all single rougle in array formatter
return np.array(score)
def method(self):
return "Rouge"
2.bleu_scorer.py
#!/usr/bin/env python
# bleu_scorer.py
# David Chiang <chiang@isi.edu>
# Copyright (c) 2004-2006 University of Maryland. All rights
# reserved. Do not redistribute without permission from the
# author. Not for commercial use.
# Modified by:
# Hao Fang <hfang@uw.edu>
# Tsung-Yi Lin <tl483@cornell.edu>
'''Provides:
cook_refs(refs, n=4): Transform a list of reference sentences as strings into a form usable by cook_test().
cook_test(test, refs, n=4): Transform a test sentence as a string (together with the cooked reference sentences) into a form usable by score_cooked().
'''
import copy
import sys, math, re
from collections import defaultdict
import six
from six.moves import xrange as range
def precook(s, n=4, out=False):
"""Takes a string as input and returns an object that can be given to
either cook_refs or cook_test. This is optional: cook_refs and cook_test
can take string arguments as well."""
words = s.split()
counts = defaultdict(int)
for k in range(1,n+1):
for i in range(len(words)-k+1):
ngram = tuple(words[i:i+k])
counts[ngram] += 1
return (len(words), counts)
def cook_refs(refs, eff=None, n=4): ## lhuang: oracle will call with "average"
'''Takes a list of reference sentences for a single segment
and returns an object that encapsulates everything that BLEU
needs to know about them.'''
reflen = []
maxcounts = {}
for ref in refs:
rl, counts = precook(ref, n)
reflen.append(rl)
for (ngram,count) in six.iteritems(counts):
maxcounts[ngram] = max(maxcounts.get(ngram,0), count)
# Calculate effective reference sentence length.
if eff == "shortest":
reflen = min(reflen)
elif eff == "average":
reflen = float(sum(reflen))/len(reflen)
## lhuang: N.B.: leave reflen computaiton to the very end!!
## lhuang: N.B.: in case of "closest", keep a list of reflens!! (bad design)
return (reflen, maxcounts)
def cook_test(test, reflen_refmaxcounts, eff=None, n=4):
'''Takes a test sentence and returns an object that
encapsulates everything that BLEU needs to know about it.'''
reflen, refmaxcounts = reflen_refmaxcounts
testlen, counts = precook(test, n, True)
result = {}
# Calculate effective reference sentence length.
if eff == "closest":
result["reflen"] = min((abs(l-testlen), l) for l in reflen)[1]
else: ## i.e., "average" or "shortest" or None
result["reflen"] = reflen
result["testlen"] = testlen
result["guess"] = [max(0,testlen-k+1) for k in range(1,n+1)]
result['correct'] = [0]*n
for (ngram, count) in six.iteritems(counts):
result["correct"][len(ngram)-1] += min(refmaxcounts.get(ngram,0), count)
return result
class BleuScorer(object):
"""Bleu scorer.
"""
__slots__ = "n", "crefs", "ctest", "_score", "_ratio", "_testlen", "_reflen", "special_reflen"
# special_reflen is used in oracle (proportional effective ref len for a node).
def copy(self):
''' copy the refs.'''
new = BleuScorer(n=self.n)
new.ctest = copy.copy(self.ctest)
new.crefs = copy.copy(self.crefs)
new._score = None
return new
def __init__(self, test=None, refs=None, n=4, special_reflen=None):
''' singular instance '''
self.n = n
self.crefs = []
self.ctest = []
self.cook_append(test, refs)
self.special_reflen = special_reflen
def cook_append(self, test, refs):
'''called by constructor and __iadd__ to avoid creating new instances.'''
if refs is not None:
self.crefs.append(cook_refs(refs))
if test is not None:
cooked_test = cook_test(test, self.crefs[-1])
self.ctest.append(cooked_test) ## N.B.: -1
else:
self.ctest.append(None) # lens of crefs and ctest have to match
self._score = None ## need to recompute
def ratio(self, option=None):
self.compute_score(option=option)
return self._ratio
def score_ratio(self, option=None):
'''return (bleu, len_ratio) pair'''
return (self.fscore(option=option), self.ratio(option=option))
def score_ratio_str(self, option=None):
return "%.4f (%.2f)" % self.score_ratio(option)
def reflen(self, option=None):
self.compute_score(option=option)
return self._reflen
def testlen(self, option=None):
self.compute_score(option=option)
return self._testlen
def retest(self, new_test):
if type(new_test) is str:
new_test = [new_test]
assert len(new_test) == len(self.crefs), new_test
self.ctest = []
for t, rs in zip(new_test, self.crefs):
self.ctest.append(cook_test(t, rs))
self._score = None
return self
def rescore(self, new_test):
''' replace test(s) with new test(s), and returns the new score.'''
return self.retest(new_test).compute_score()
def size(self):
assert len(self.crefs) == len(self.ctest), "refs/test mismatch! %d<>%d" % (len(self.crefs), len(self.ctest))
return len(self.crefs)
def __iadd__(self, other):
'''add an instance (e.g., from another sentence).'''
if type(other) is tuple:
## avoid creating new BleuScorer instances
self.cook_append(other[0], other[1])
else:
assert self.compatible(other), "incompatible BLEUs."
self.ctest.extend(other.ctest)
self.crefs.extend(other.crefs)
self._score = None ## need to recompute
return self
def compatible(self, other):
return isinstance(other, BleuScorer) and self.n == other.n
def single_reflen(self, option="average"):
return self._single_reflen(self.crefs[0][0], option)
def _single_reflen(self, reflens, option=None, testlen=None):
if option == "shortest":
reflen = min(reflens)
elif option == "average":
reflen = float(sum(reflens))/len(reflens)
elif option == "closest":
reflen = min((abs(l-testlen), l) for l in reflens)[1]
else:
assert False, "unsupported reflen option %s" % option
return reflen
def recompute_score(self, option=None, verbose=0):
self._score = None
return self.compute_score(option, verbose)
def compute_score(self, option=None, verbose=0):
n = self.n
small = 1e-9
tiny = 1e-15 ## so that if guess is 0 still return 0
bleu_list = [[] for _ in range(n)]
if self._score is not None:
return self._score
if option is None:
option = "average" if len(self.crefs) == 1 else "closest"
self._testlen = 0
self._reflen = 0
totalcomps = {'testlen':0, 'reflen':0, 'guess':[0]*n, 'correct':[0]*n}
# for each sentence
for comps in self.ctest:
testlen = comps['testlen']
self._testlen += testlen
if self.special_reflen is None: ## need computation
reflen = self._single_reflen(comps['reflen'], option, testlen)
else:
reflen = self.special_reflen
self._reflen += reflen
for key in ['guess','correct']:
for k in range(n):
totalcomps[key][k] += comps[key][k]
# append per image bleu score
bleu = 1.
for k in range(n):
bleu *= (float(comps['correct'][k]) + tiny) \
/(float(comps['guess'][k]) + small)
bleu_list[k].append(bleu ** (1./(k+1)))
ratio = (testlen + tiny) / (reflen + small) ## N.B.: avoid zero division
if ratio < 1:
for k in range(n):
bleu_list[k][-1] *= math.exp(1 - 1/ratio)
if verbose > 1:
print(comps, reflen)
totalcomps['reflen'] = self._reflen
totalcomps['testlen'] = self._testlen
bleus = []
bleu = 1.
for k in range(n):
bleu *= float(totalcomps['correct'][k] + tiny) \
/ (totalcomps['guess'][k] + small)
bleus.append(bleu ** (1./(k+1)))
ratio = (self._testlen + tiny) / (self._reflen + small) ## N.B.: avoid zero division
if ratio < 1:
for k in range(n):
bleus[k] *= math.exp(1 - 1/ratio)
if verbose > 0:
print(totalcomps)
print("ratio:", ratio)
self._score = bleus
return self._score, bleu_list
3. bleu.py
#!/usr/bin/env python
#
# File Name : bleu.py
#
# Description : Wrapper for BLEU scorer.
#
# Creation Date : 06-01-2015
# Last Modified : Thu 19 Mar 2015 09:13:28 PM PDT
# Authors : Hao Fang <hfang@uw.edu> and Tsung-Yi Lin <tl483@cornell.edu>
from bleu_scorer import BleuScorer
class Bleu:
def __init__(self, n=4):
# default compute Blue score up to 4
self._n = n
self._hypo_for_image = {}
self.ref_for_image = {}
def compute_score(self, gts, res):
assert(gts.keys() == res.keys())
imgIds = gts.keys()
bleu_scorer = BleuScorer(n=self._n)
for id in imgIds:
hypo = res[id]
ref = gts[id]
# Sanity check.
assert(type(hypo) is list)
assert(len(hypo) == 1)
assert(type(ref) is list)
assert(len(ref) >= 1)
bleu_scorer += (hypo[0], ref)
#score, scores = bleu_scorer.compute_score(option='shortest')
score, scores = bleu_scorer.compute_score(option='closest', verbose=0)
#score, scores = bleu_scorer.compute_score(option='average', verbose=1)
# return (bleu, bleu_info)
# return score, scores # return total_blue(score) and all four singel blue(scores)
return scores[3] # scores[3] represents BLEU4, in this experiment,we only use bleu4
def method(self):
return "Bleu"
4. 主函数,metric.py
from numpy.lib.shape_base import tile
from bleu import Bleu
from rougel import Rouge
import json
import numpy as np
#Function: 1) 计算两个按行存取的txt的句子对的ROUGEL和Bleu值(需要将按行存取的TXT修改成字典的格式,参考上一篇博客)。2)计算ROUGEL+BLEU的值,并将和由高到低排序 3)
bleu_scorer = Bleu(4)
rouge_scorer = Rouge()
## 将原始按行存入记事本的文件(仅含有sentence),修改成字典形式 eg:groundtruth = {0: ["i love china"], 1:["today is a good day"]} # {}:dict
# 从0开始
def Readfile(file_name):
line_list = []
dic = {}
fr = open(file_name,"r")
line_number = 0
for line in fr.readlines():
line = line.strip()
if len(line) != 0:
line_list.append(line)
dic[line_number] = [line]
line_number += 1
# fw.write(json.dumps(dic))
# fw.write(dic)
return dic
fr.close()
fw.close()
if __name__=="__main__":
raw_name = "/home/qtxu/semeval_data/sentences/restaurants_16.train" # original data
bt_name = "/home/qtxu/semeval_data/bt/french/sentences/restaurants_16.train" # bt data
write_raw_name = "/home/qtxu/semeval_data/Ten_Metric/new_order_sentence/french/r16_fr_neworder.txt"
write_bt_name = "/home/qtxu/semeval_data/Ten_Metric/new_bt_order_sentence/french/r16_bt_fr_neworder.txt" # interlanguages neworder data
raw_dic = Readfile(raw_name)
print(raw_dic)
bt_dic = Readfile(bt_name)
Bleu_value = bleu_scorer.compute_score(raw_dic, bt_dic) # list
Rougle_value = rouge_scorer.compute_score(raw_dic, bt_dic) # list
# print(Bleu_value)
# print(Rougle_value)
list_final = [] #用于存放两个评价指标的计算和
i = 0
for index, item in enumerate(Bleu_value):
list_final.append(item+Rougle_value[index]) #add two metric(bleu and rouge)
# print(list_final)
# print("排序结果是",sorted(list_final)) # sorted() default 升序
sorted_id = sorted(range(len(list_final)), key=lambda k: list_final[k], reverse=True) # reverse=True 降序,sorted_id从0开始
# print(sorted_id)
result = [raw_dic[i][0] for i in sorted_id] # result:sorted_id对应的,原始的txt中的sentence
# print(result)
# fw_raw = open(write_raw_name,"w")
# for i in sorted_id:
# fw_raw.write(raw_dic[i][0]+"\n")
fw_bt = open(write_bt_name,"w")
for i in sorted_id:
fw_bt.write(bt_dic[i][0]+"\n")
# print(bt_dic[i][0])
print("write over")
更多推荐
已为社区贡献11条内容
所有评论(0)