From 9a9de4dc007b2d4003f63f9aeea35ec39f505ac5 Mon Sep 17 00:00:00 2001 From: hankcs Date: Thu, 20 May 2021 14:01:03 -0400 Subject: [PATCH] Remove iwpt eval scripts --- hanlp/metrics/parsing/iwpt20_eval.py | 154 ----- hanlp/metrics/parsing/iwpt20_xud_eval.py | 766 ----------------------- 2 files changed, 920 deletions(-) delete mode 100644 hanlp/metrics/parsing/iwpt20_eval.py delete mode 100644 hanlp/metrics/parsing/iwpt20_xud_eval.py diff --git a/hanlp/metrics/parsing/iwpt20_eval.py b/hanlp/metrics/parsing/iwpt20_eval.py deleted file mode 100644 index 0b322f4b1..000000000 --- a/hanlp/metrics/parsing/iwpt20_eval.py +++ /dev/null @@ -1,154 +0,0 @@ -# -*- coding:utf-8 -*- -# Author: hankcs -# Date: 2020-03-25 16:04 - -import os -import tempfile -from typing import List - -from hanlp.metrics.parsing.conllx_eval import copy_cols - -from hanlp_common.structure import SerializableDict -from hanlp.metrics.parsing import iwpt20_xud_eval -from hanlp.metrics.parsing.iwpt20_xud_eval import load_conllu_file -from hanlp.utils.io_util import get_resource, get_exitcode_stdout_stderr - -UD_TOOLS_ROOT = get_resource( - 'https://github.com/UniversalDependencies/tools/archive/1650bd354bd158c75836cff6650ea35cc9928fc8.zip') - -ENHANCED_COLLAPSE_EMPTY_NODES = os.path.join(UD_TOOLS_ROOT, 'enhanced_collapse_empty_nodes.pl') -CONLLU_QUICK_FIX = os.path.join(UD_TOOLS_ROOT, 'conllu-quick-fix.pl') - - -def run_perl(script, src, dst=None): - if not dst: - dst = tempfile.NamedTemporaryFile().name - exitcode, out, err = get_exitcode_stdout_stderr( - f'perl -I{os.path.expanduser("~/.local/lib/perl5")} {script} {src}') - if exitcode: - # cpanm -l ~/.local namespace::autoclean - # cpanm -l ~/.local Moose - # cpanm -l ~/.local MooseX::SemiAffordanceAccessor module - raise RuntimeError(err) - with open(dst, 'w') as ofile: - ofile.write(out) - return dst - - -def enhanced_collapse_empty_nodes(src, dst=None): - return run_perl(ENHANCED_COLLAPSE_EMPTY_NODES, src, dst) - - -def conllu_quick_fix(src, 
dst=None): - return run_perl(CONLLU_QUICK_FIX, src, dst) - - -def load_conll_to_str(path) -> List[str]: - with open(path) as src: - text = src.read() - sents = text.split('\n\n') - sents = [x for x in sents if x.strip()] - return sents - - -def remove_complete_edges(src, dst): - sents = load_conll_to_str(src) - with open(dst, 'w') as out: - for each in sents: - for line in each.split('\n'): - if line.startswith('#'): - out.write(line) - else: - cells = line.split('\t') - cells[7] = cells[7].split(':')[0] - out.write('\t'.join(cells)) - out.write('\n') - out.write('\n') - - -def remove_collapse_edges(src, dst): - sents = load_conll_to_str(src) - with open(dst, 'w') as out: - for each in sents: - for line in each.split('\n'): - if line.startswith('#'): - out.write(line) - else: - cells = line.split('\t') - deps = cells[8].split('|') - deps = [x.split('>')[0] for x in deps] - cells[8] = '|'.join(deps) - out.write('\t'.join(cells)) - out.write('\n') - out.write('\n') - - -def restore_collapse_edges(src, dst): - sents = load_conll_to_str(src) - with open(dst, 'w') as out: - for each in sents: - empty_nodes = {} # head to deps - lines = each.split('\n') - tokens = [x for x in lines if not x.startswith('#') and x.split()[0].isdigit()] - for line in lines: - line = line.strip() - if not line: - continue - if line.startswith('#'): - out.write(line) - else: - cells = line.split('\t') - deps = cells[8].split('|') - for i, d in enumerate(deps): - if '>' in d: - head, rel = d.split(':', 1) - ehead = f'{len(tokens)}.{len(empty_nodes) + 1}' - par, cur = rel.split('>', 1) - cur = cur.split('>')[0] - deps[i] = f'{ehead}:{cur}' - empty_nodes[ehead] = f'{head}:{par}' - cells[8] = '|'.join(deps) - out.write('\t'.join(cells)) - out.write('\n') - num_tokens = int(line.split('\t')[0]) - assert num_tokens == len(tokens) - for idx, (ehead, deps) in enumerate(empty_nodes.items()): - out.write(f'{num_tokens}.{idx + 1}\t' + '_\t' * 7 + deps + '\t_\n') - out.write('\n') - - -def 
evaluate(gold_file, pred_file, do_enhanced_collapse_empty_nodes=False, do_copy_cols=True): - """Evaluate using official CoNLL-X evaluation script (Yuval Krymolowski) - - Args: - gold_file(str): The gold conllx file - pred_file(str): The pred conllx file - do_enhanced_collapse_empty_nodes: (Default value = False) - do_copy_cols: (Default value = True) - - Returns: - - - """ - if do_enhanced_collapse_empty_nodes: - gold_file = enhanced_collapse_empty_nodes(gold_file) - pred_file = enhanced_collapse_empty_nodes(pred_file) - if do_copy_cols: - fixed_pred_file = pred_file.replace('.conllu', '.fixed.conllu') - copy_cols(gold_file, pred_file, fixed_pred_file) - else: - fixed_pred_file = pred_file - args = SerializableDict() - args.enhancements = '0' - args.gold_file = gold_file - args.system_file = fixed_pred_file - return iwpt20_xud_eval.evaluate_wrapper(args) - - -def main(): - print(evaluate('data/iwpt2020/iwpt2020-test-gold/cs.conllu', - 'data/model/iwpt2020/bert/ens/cs.conllu', do_enhanced_collapse_empty_nodes=True)) - - -if __name__ == '__main__': - main() diff --git a/hanlp/metrics/parsing/iwpt20_xud_eval.py b/hanlp/metrics/parsing/iwpt20_xud_eval.py deleted file mode 100644 index 1b59ac1b8..000000000 --- a/hanlp/metrics/parsing/iwpt20_xud_eval.py +++ /dev/null @@ -1,766 +0,0 @@ -#!/usr/bin/env python3 - -# updated code from conll 2018 ud shared task for evaluation of enhanced dependencies in iwpt 2020 shared task -# -- read DEPS, split on '|', compute overlap -# Gosse Bouma - -# Compatible with Python 2.7 and 3.2+, can be used either as a module -# or a standalone executable. -# -# Copyright 2017, 2018 Institute of Formal and Applied Linguistics (UFAL), -# Faculty of Mathematics and Physics, Charles University, Czech Republic. -# -# This Source Code Form is subject to the terms of the Mozilla Public -# License, v. 2.0. If a copy of the MPL was not distributed with this -# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
-# -# Authors: Milan Straka, Martin Popel -# -# Changelog: -# - [12 Apr 2018] Version 0.9: Initial release. -# - [19 Apr 2018] Version 1.0: Fix bug in MLAS (duplicate entries in functional_children). -# Add --counts option. -# - [02 May 2018] Version 1.1: When removing spaces to match gold and system characters, -# consider all Unicode characters of category Zs instead of -# just ASCII space. -# - [25 Jun 2018] Version 1.2: Use python3 in the she-bang (instead of python). -# In Python2, make the whole computation use `unicode` strings. - -# Command line usage -# ------------------ -# iwpt20_eud_eval.py3 [-v] [-c] gold_conllu_file system_conllu_file -# -# - if no -v is given, only the official IWPT 2020 Shared Task evaluation metrics -# are printed -# - if -v is given, more metrics are printed (as precision, recall, F1 score, -# and in case the metric is computed on aligned words also accuracy on these): -# - Tokens: how well do the gold tokens match system tokens -# - Sentences: how well do the gold sentences match system sentences -# - Words: how well can the gold words be aligned to system words -# - UPOS: using aligned words, how well does UPOS match -# - XPOS: using aligned words, how well does XPOS match -# - UFeats: using aligned words, how well does universal FEATS match -# - AllTags: using aligned words, how well does UPOS+XPOS+FEATS match -# - Lemmas: using aligned words, how well does LEMMA match -# - UAS: using aligned words, how well does HEAD match -# - LAS: using aligned words, how well does HEAD+DEPREL(ignoring subtypes) match -# - CLAS: using aligned words with content DEPREL, how well does -# HEAD+DEPREL(ignoring subtypes) match -# - MLAS: using aligned words with content DEPREL, how well does -# HEAD+DEPREL(ignoring subtypes)+UPOS+UFEATS+FunctionalChildren(DEPREL+UPOS+UFEATS) match -# - BLEX: using aligned words with content DEPREL, how well does -# HEAD+DEPREL(ignoring subtypes)+LEMMAS match -# - if -c is given, raw counts of 
correct/gold_total/system_total/aligned words are printed -# instead of precision/recall/F1/AlignedAccuracy for all metrics. - -# API usage -# --------- -# - load_conllu(file) -# - loads CoNLL-U file from given file object to an internal representation -# - the file object should return str in both Python 2 and Python 3 -# - raises UDError exception if the given file cannot be loaded -# - evaluate(gold_ud, system_ud) -# - evaluate the given gold and system CoNLL-U files (loaded with load_conllu) -# - raises UDError if the concatenated tokens of gold and system file do not match -# - returns a dictionary with the metrics described above, each metric having -# three fields: precision, recall and f1 - -# Description of token matching -# ----------------------------- -# In order to match tokens of gold file and system file, we consider the text -# resulting from concatenation of gold tokens and text resulting from -# concatenation of system tokens. These texts should match -- if they do not, -# the evaluation fails. -# -# If the texts do match, every token is represented as a range in this original -# text, and tokens are equal only if their range is the same. - -# Description of word matching -# ---------------------------- -# When matching words of gold file and system file, we first match the tokens. -# The words which are also tokens are matched as tokens, but words in multi-word -# tokens have to be handled differently. -# -# To handle multi-word tokens, we start by finding "multi-word spans". -# Multi-word span is a span in the original text such that -# - it contains at least one multi-word token -# - all multi-word tokens in the span (considering both gold and system ones) -# are completely inside the span (i.e., they do not "stick out") -# - the multi-word span is as small as possible -# -# For every multi-word span, we align the gold and system words completely -# inside this span using LCS on their FORMs. 
The words not intersecting -# (even partially) any multi-word span are then aligned as tokens. - - -from __future__ import division -from __future__ import print_function - -import argparse -import io -import sys -import unicodedata -import unittest - -# CoNLL-U column names -ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC = range(10) - -# Content and functional relations -CONTENT_DEPRELS = { - "nsubj", "obj", "iobj", "csubj", "ccomp", "xcomp", "obl", "vocative", - "expl", "dislocated", "advcl", "advmod", "discourse", "nmod", "appos", - "nummod", "acl", "amod", "conj", "fixed", "flat", "compound", "list", - "parataxis", "orphan", "goeswith", "reparandum", "root", "dep" -} - -FUNCTIONAL_DEPRELS = { - "aux", "cop", "mark", "det", "clf", "case", "cc" -} - -UNIVERSAL_FEATURES = { - "PronType", "NumType", "Poss", "Reflex", "Foreign", "Abbr", "Gender", - "Animacy", "Number", "Case", "Definite", "Degree", "VerbForm", "Mood", - "Tense", "Aspect", "Voice", "Evident", "Polarity", "Person", "Polite" -} - -# UD Error is used when raising exceptions in this module -class UDError(Exception): - pass - -# Conversion methods handling `str` <-> `unicode` conversions in Python2 -def _decode(text): - return text if sys.version_info[0] >= 3 or not isinstance(text, str) else text.decode("utf-8") - -def _encode(text): - return text if sys.version_info[0] >= 3 or not isinstance(text, unicode) else text.encode("utf-8") - -CASE_DEPRELS = {'obl','nmod','conj','advcl'} -UNIVERSAL_DEPREL_EXTENSIONS = {'pass','relcl','xsubj'} - -# modify the set of deps produced by system to be in accordance with gold treebank type -# return a (filtered) list of (hd,dependency_path) tuples. 
-- GB -def process_enhanced_deps(deps) : - edeps = [] - for edep in deps.split('|') : - (hd,path) = edep.split(':',1) - steps = path.split('>') # collapsing empty nodes gives rise to paths like this : 3:conj:en>obl:voor - edeps.append((hd,steps)) # (3,['conj:en','obj:voor']) - return edeps - -# Load given CoNLL-U file into internal representation -def load_conllu(file,treebank_type): - # Internal representation classes - class UDRepresentation: - def __init__(self): - # Characters of all the tokens in the whole file. - # Whitespace between tokens is not included. - self.characters = [] - # List of UDSpan instances with start&end indices into `characters`. - self.tokens = [] - # List of UDWord instances. - self.words = [] - # List of UDSpan instances with start&end indices into `characters`. - self.sentences = [] - class UDSpan: - def __init__(self, start, end): - self.start = start - # Note that self.end marks the first position **after the end** of span, - # so we can use characters[start:end] or range(start, end). - self.end = end - class UDWord: - def __init__(self, span, columns, is_multiword): - # Span of this word (or MWT, see below) within ud_representation.characters. - self.span = span - # 10 columns of the CoNLL-U file: ID, FORM, LEMMA,... - self.columns = columns - # is_multiword==True means that this word is part of a multi-word token. - # In that case, self.span marks the span of the whole multi-word token. - self.is_multiword = is_multiword - # Reference to the UDWord instance representing the HEAD (or None if root). - self.parent = None - # List of references to UDWord instances representing functional-deprel children. - self.functional_children = [] - # Only consider universal FEATS. - self.columns[FEATS] = "|".join(sorted(feat for feat in columns[FEATS].split("|") - if feat.split("=", 1)[0] in UNIVERSAL_FEATURES)) - # Let's ignore language-specific deprel subtypes. 
- self.columns[DEPREL] = columns[DEPREL].split(":")[0] - # Precompute which deprels are CONTENT_DEPRELS and which FUNCTIONAL_DEPRELS - self.is_content_deprel = self.columns[DEPREL] in CONTENT_DEPRELS - self.is_functional_deprel = self.columns[DEPREL] in FUNCTIONAL_DEPRELS - # store enhanced deps --GB - # split string positions and enhanced labels as well? - self.columns[DEPS] = process_enhanced_deps(columns[DEPS]) - - ud = UDRepresentation() - - # Load the CoNLL-U file - index, sentence_start = 0, None - - modified_deprels = 0 - - while True: - line = file.readline() - if not line: - break - line = _decode(line.rstrip("\r\n")) - - # Handle sentence start boundaries - if sentence_start is None: - # Skip comments - if line.startswith("#"): - continue - # Start a new sentence - ud.sentences.append(UDSpan(index, 0)) - sentence_start = len(ud.words) - if not line: - # Add parent and children UDWord links and check there are no cycles - def process_word(word): - if word.parent == "remapping": - raise UDError("There is a cycle in a sentence") - if word.parent is None: - head = int(word.columns[HEAD]) - if head < 0 or head > len(ud.words) - sentence_start: - raise UDError("HEAD '{}' points outside of the sentence".format(_encode(word.columns[HEAD]))) - if head: - parent = ud.words[sentence_start + head - 1] - word.parent = "remapping" - process_word(parent) - word.parent = parent - - - position = sentence_start # need to incrementally keep track of current position for loop detection in relcl - for word in ud.words[sentence_start:]: - process_word(word) - enhanced_deps = word.columns[DEPS] - # replace head positions of enhanced dependencies with parent word object -- GB - processed_deps = [] - for (head,steps) in word.columns[DEPS] : # (3,['conj:en','obj:voor']) - hd = int(head) - parent = ud.words[sentence_start + hd -1] if hd else hd # just assign '0' to parent for root cases - processed_deps.append((parent,steps)) - enhanced_deps = processed_deps - - # make the 
evaluation script ignore various types of enhancements -- GB - - # ignore rel>rel dependencies, and instead append the original hd/rel edge - # note that this also ignores other extensions (like adding lemma's) - # note that this sometimes introduces duplicates (if orig hd/rel was already included in DEPS) - if (treebank_type['no_gapping']) : # enhancement 1 - processed_deps = [] - for (parent,steps) in enhanced_deps : - if len(steps) > 1 : - #print("replaced {} by {}".format(steps,word.columns[DEPREL])) - (parent,steps) = (word.parent,[word.columns[DEPREL]]) - modified_deprels += 1 - if not((parent,steps) in processed_deps) : - processed_deps.append((parent,steps)) - enhanced_deps = processed_deps - - # for a given conj node, any rel other than conj in DEPS can be ignored - if treebank_type['no_shared_parents_in_coordination'] : # enhancement 2 - for (parent,steps) in enhanced_deps : - if len(steps) == 1 and steps[0].startswith('conj') : - enhanced_deps = [(parent,steps)] - modified_deprels += 1 - - # duplicate deprels not matching ud_hd/ud_dep are spurious. 
- # czech/pud estonian/ewt syntagrus finnish/pud - # NB: treebanks that do not mark xcomp and relcl subjects: we now preserve duplicate nsubj if parent is xcomp - # but in: the man who walked and talked, we now also preserve nsubj 2x for 'who' - # idem in I know that she walked and talked - if treebank_type['no_shared_dependents_in_coordination'] : # enhancement 3 - processed_deps = [] - for (parent,steps) in enhanced_deps : - duplicate = 0 - ud_hd = word.parent - for (p2,s2) in enhanced_deps : - if steps == s2 and p2 == ud_hd and parent != p2 : - if not (p2.columns[DEPREL] in ('xcomp','acl','acl:relcl') and steps == ['nsubj']) : - duplicate = 1 - modified_deprels += 1 - if not(duplicate) : - processed_deps.append((parent,steps)) - enhanced_deps = processed_deps - - # if treebank does not have control relations: subjects of xcomp parents in system are to be skipped - # note that rel is actually a path sometimes rel1>rel2 in theory rel2 could be subj? - # from lassy-small: 7:conj:en>nsubj:pass|7:conj:en>nsubj:xsubj (7,['conj:en','nsubj:xsubj']) - if (treebank_type['no_control']) : # enhancement 4 - processed_deps = [] - for (parent,steps) in enhanced_deps : - include = 1 - if ( parent and parent.columns[DEPREL] == 'xcomp') : - for rel in steps: - if rel.startswith('nsubj') : - include = 0 - modified_deprels += 1 - if include : - processed_deps.append((parent,steps)) - enhanced_deps = processed_deps - - if (treebank_type['no_external_arguments_of_relative_clauses']) : # enhancement 5 - processed_deps = [] - for (parent,steps) in enhanced_deps : - if (steps[0] == 'ref') : - processed_deps.append((word.parent,[word.columns[DEPREL]])) # append the original relation - modified_deprels += 1 - # ignore external argument link - # external args are deps of an acl:relcl where that acl also is a dependent of external arg (i.e. 
ext arg introduces a cycle) - elif ( parent and parent.columns[DEPREL].startswith('acl') and int(parent.columns[HEAD]) == position - sentence_start ) : - #print('removed external argument') - modified_deprels += 1 - else : - processed_deps.append((parent,steps)) - enhanced_deps = processed_deps - - # treebanks where no lemma info has been added - if treebank_type['no_case_info'] : # enhancement number 6 - processed_deps = [] - for (hd,steps) in enhanced_deps : - processed_steps = [] - for dep in steps : - depparts = dep.split(':') - if depparts[0] in CASE_DEPRELS : - if (len(depparts) == 2 and not(depparts[1] in UNIVERSAL_DEPREL_EXTENSIONS )) : - dep = depparts[0] - modified_deprels += 1 - processed_steps.append(dep) - processed_deps.append((hd,processed_steps)) - enhanced_deps = processed_deps - - position += 1 - word.columns[DEPS] = enhanced_deps - - - # func_children cannot be assigned within process_word - # because it is called recursively and may result in adding one child twice. - for word in ud.words[sentence_start:]: - if word.parent and word.is_functional_deprel: - word.parent.functional_children.append(word) - - # Check there is a single root node - if len([word for word in ud.words[sentence_start:] if word.parent is None]) != 1: - raise UDError("There are multiple roots in a sentence") - - # End the sentence - ud.sentences[-1].end = index - sentence_start = None - continue - - # Read next token/word - columns = line.split("\t") - if len(columns) != 10: - raise UDError("The CoNLL-U line does not contain 10 tab-separated columns: '{}'".format(_encode(line))) - - # Skip empty nodes - # After collapsing empty nodes into the enhancements, these should not occur --GB - if "." in columns[ID]: - raise UDError("The collapsed CoNLL-U line still contains empty nodes: {}".format(_encode(line))) - - # Delete spaces from FORM, so gold.characters == system.characters - # even if one of them tokenizes the space. Use any Unicode character - # with category Zs. 
- columns[FORM] = "".join(filter(lambda c: unicodedata.category(c) != "Zs", columns[FORM])) - if not columns[FORM]: - raise UDError("There is an empty FORM in the CoNLL-U file") - - # Save token - ud.characters.extend(columns[FORM]) - ud.tokens.append(UDSpan(index, index + len(columns[FORM]))) - index += len(columns[FORM]) - - # Handle multi-word tokens to save word(s) - if "-" in columns[ID]: - try: - start, end = map(int, columns[ID].split("-")) - except: - raise UDError("Cannot parse multi-word token ID '{}'".format(_encode(columns[ID]))) - - for _ in range(start, end + 1): - word_line = _decode(file.readline().rstrip("\r\n")) - word_columns = word_line.split("\t") - if len(word_columns) != 10: - raise UDError("The CoNLL-U line does not contain 10 tab-separated columns: '{}'".format(_encode(word_line))) - ud.words.append(UDWord(ud.tokens[-1], word_columns, is_multiword=True)) - - # Basic tokens/words - else: - try: - word_id = int(columns[ID]) - except: - raise UDError("Cannot parse word ID '{}'".format(_encode(columns[ID]))) - if word_id != len(ud.words) - sentence_start + 1: - raise UDError("Incorrect word ID '{}' for word '{}', expected '{}'".format( - _encode(columns[ID]), _encode(columns[FORM]), len(ud.words) - sentence_start + 1)) - - try: - head_id = int(columns[HEAD]) - except: - raise UDError("Cannot parse HEAD '{}'".format(_encode(columns[HEAD]))) - if head_id < 0: - raise UDError("HEAD cannot be negative") - - ud.words.append(UDWord(ud.tokens[-1], columns, is_multiword=False)) - - if modified_deprels : - print('modified/deleted {} enhanced DEPRELS in {}'.format(modified_deprels,file.name)) - - if sentence_start is not None: - raise UDError("The CoNLL-U file does not end with empty line") - - return ud - -# Evaluate the gold and system treebanks (loaded using load_conllu). 
-def evaluate(gold_ud, system_ud): - class Score: - def __init__(self, gold_total, system_total, correct, aligned_total=None): - self.correct = correct - self.gold_total = gold_total - self.system_total = system_total - self.aligned_total = aligned_total - self.precision = correct / system_total if system_total else 0.0 - self.recall = correct / gold_total if gold_total else 0.0 - self.f1 = 2 * correct / (system_total + gold_total) if system_total + gold_total else 0.0 - self.aligned_accuracy = correct / aligned_total if aligned_total else aligned_total - class AlignmentWord: - def __init__(self, gold_word, system_word): - self.gold_word = gold_word - self.system_word = system_word - class Alignment: - def __init__(self, gold_words, system_words): - self.gold_words = gold_words - self.system_words = system_words - self.matched_words = [] - self.matched_words_map = {} - def append_aligned_words(self, gold_word, system_word): - self.matched_words.append(AlignmentWord(gold_word, system_word)) - self.matched_words_map[system_word] = gold_word - - def spans_score(gold_spans, system_spans): - correct, gi, si = 0, 0, 0 - while gi < len(gold_spans) and si < len(system_spans): - if system_spans[si].start < gold_spans[gi].start: - si += 1 - elif gold_spans[gi].start < system_spans[si].start: - gi += 1 - else: - correct += gold_spans[gi].end == system_spans[si].end - si += 1 - gi += 1 - - return Score(len(gold_spans), len(system_spans), correct) - - def alignment_score(alignment, key_fn=None, filter_fn=None): - if filter_fn is not None: - gold = sum(1 for gold in alignment.gold_words if filter_fn(gold)) - system = sum(1 for system in alignment.system_words if filter_fn(system)) - aligned = sum(1 for word in alignment.matched_words if filter_fn(word.gold_word)) - else: - gold = len(alignment.gold_words) - system = len(alignment.system_words) - aligned = len(alignment.matched_words) - - if key_fn is None: - # Return score for whole aligned words - return Score(gold, system, 
aligned) - - def gold_aligned_gold(word): - return word - def gold_aligned_system(word): - return alignment.matched_words_map.get(word, "NotAligned") if word is not None else None - correct = 0 - for words in alignment.matched_words: - if filter_fn is None or filter_fn(words.gold_word): - if key_fn(words.gold_word, gold_aligned_gold) == key_fn(words.system_word, gold_aligned_system): - correct += 1 - - return Score(gold, system, correct, aligned) - - def enhanced_alignment_score(alignment): - # count all matching enhanced deprels in gold, system GB - # gold and system = sum of gold and predicted deps - # parents are pointers to word object, make sure to compare system parent with aligned word in gold in cases where - # tokenization introduces mismatches in number of words per sentence. - gold = 0 - for gold_word in alignment.gold_words : - gold += len(gold_word.columns[DEPS]) - system = 0 - for system_word in alignment.system_words : - system += len(system_word.columns[DEPS]) - # NB aligned does not play a role in computing f1 score -- GB - aligned = len(alignment.matched_words) - correct = 0 - for words in alignment.matched_words: - gold_deps = words.gold_word.columns[DEPS] - system_deps = words.system_word.columns[DEPS] - for (parent,dep) in gold_deps : - for (sparent,sdep) in system_deps : - if dep == sdep : - if parent == alignment.matched_words_map.get(sparent,"NotAligned") : - correct += 1 - elif (parent == 0 and sparent == 0) : # cases where parent is root - correct += 1 - - return Score(gold, system, correct, aligned) - - - def beyond_end(words, i, multiword_span_end): - if i >= len(words): - return True - if words[i].is_multiword: - return words[i].span.start >= multiword_span_end - return words[i].span.end > multiword_span_end - - def extend_end(word, multiword_span_end): - if word.is_multiword and word.span.end > multiword_span_end: - return word.span.end - return multiword_span_end - - def find_multiword_span(gold_words, system_words, gi, si): - # We 
know gold_words[gi].is_multiword or system_words[si].is_multiword. - # Find the start of the multiword span (gs, ss), so the multiword span is minimal. - # Initialize multiword_span_end characters index. - if gold_words[gi].is_multiword: - multiword_span_end = gold_words[gi].span.end - if not system_words[si].is_multiword and system_words[si].span.start < gold_words[gi].span.start: - si += 1 - else: # if system_words[si].is_multiword - multiword_span_end = system_words[si].span.end - if not gold_words[gi].is_multiword and gold_words[gi].span.start < system_words[si].span.start: - gi += 1 - gs, ss = gi, si - - # Find the end of the multiword span - # (so both gi and si are pointing to the word following the multiword span end). - while not beyond_end(gold_words, gi, multiword_span_end) or \ - not beyond_end(system_words, si, multiword_span_end): - if gi < len(gold_words) and (si >= len(system_words) or - gold_words[gi].span.start <= system_words[si].span.start): - multiword_span_end = extend_end(gold_words[gi], multiword_span_end) - gi += 1 - else: - multiword_span_end = extend_end(system_words[si], multiword_span_end) - si += 1 - return gs, ss, gi, si - - def compute_lcs(gold_words, system_words, gi, si, gs, ss): - lcs = [[0] * (si - ss) for i in range(gi - gs)] - for g in reversed(range(gi - gs)): - for s in reversed(range(si - ss)): - if gold_words[gs + g].columns[FORM].lower() == system_words[ss + s].columns[FORM].lower(): - lcs[g][s] = 1 + (lcs[g+1][s+1] if g+1 < gi-gs and s+1 < si-ss else 0) - lcs[g][s] = max(lcs[g][s], lcs[g+1][s] if g+1 < gi-gs else 0) - lcs[g][s] = max(lcs[g][s], lcs[g][s+1] if s+1 < si-ss else 0) - return lcs - - def align_words(gold_words, system_words): - alignment = Alignment(gold_words, system_words) - - gi, si = 0, 0 - while gi < len(gold_words) and si < len(system_words): - if gold_words[gi].is_multiword or system_words[si].is_multiword: - # A: Multi-word tokens => align via LCS within the whole "multiword span". 
# NOTE(review): this excerpt begins inside evaluate() -> align_words(); both
# "def" headers are above this chunk, so the indentation below reflects their
# nesting (align_words is a closure defined inside evaluate()).

                # Multi-word token on one side (see branch B below for the
                # no-multiword case): align the two word ranges covering the
                # same raw-text span via longest common subsequence of forms.
                gs, ss, gi, si = find_multiword_span(gold_words, system_words, gi, si)

                if si > ss and gi > gs:
                    lcs = compute_lcs(gold_words, system_words, gi, si, gs, ss)

                    # Store aligned words
                    s, g = 0, 0
                    while g < gi - gs and s < si - ss:
                        if gold_words[gs + g].columns[FORM].lower() == system_words[ss + s].columns[FORM].lower():
                            # Same surface form (case-insensitive): align the pair.
                            alignment.append_aligned_words(gold_words[gs+g], system_words[ss+s])
                            g += 1
                            s += 1
                        elif lcs[g][s] == (lcs[g+1][s] if g+1 < gi-gs else 0):
                            # Skipping this gold word does not shrink the LCS.
                            g += 1
                        else:
                            s += 1
            else:
                # B: No multi-word token => align according to spans.
                if (gold_words[gi].span.start, gold_words[gi].span.end) == (system_words[si].span.start, system_words[si].span.end):
                    alignment.append_aligned_words(gold_words[gi], system_words[si])
                    gi += 1
                    si += 1
                elif gold_words[gi].span.start <= system_words[si].span.start:
                    # Gold word starts no later than the system word: advance gold.
                    gi += 1
                else:
                    si += 1

        return alignment

    # Check that the underlying character sequences do match.
    if gold_ud.characters != system_ud.characters:
        # Find the first divergence so the error message can show context.
        index = 0
        while index < len(gold_ud.characters) and index < len(system_ud.characters) and \
                gold_ud.characters[index] == system_ud.characters[index]:
            index += 1

        raise UDError(
            "The concatenation of tokens in gold file and in system file differ!\n" +
            "First 20 differing characters in gold file: '{}' and system file: '{}'".format(
                "".join(map(_encode, gold_ud.characters[index:index + 20])),
                "".join(map(_encode, system_ud.characters[index:index + 20]))
            )
        )

    # Align words
    alignment = align_words(gold_ud.words, system_ud.words)

    # Compute the F1-scores
    return {
        "Tokens": spans_score(gold_ud.tokens, system_ud.tokens),
        "Sentences": spans_score(gold_ud.sentences, system_ud.sentences),
        "Words": alignment_score(alignment),
        "UPOS": alignment_score(alignment, lambda w, _: w.columns[UPOS]),
        "XPOS": alignment_score(alignment, lambda w, _: w.columns[XPOS]),
        "UFeats": alignment_score(alignment, lambda w, _: w.columns[FEATS]),
        "AllTags": alignment_score(alignment, lambda w, _: (w.columns[UPOS], w.columns[XPOS], w.columns[FEATS])),
        # A "_" lemma on the aligned side is mapped to "_" on both sides, i.e.
        # unannotated lemmas always count as correct.
        "Lemmas": alignment_score(alignment, lambda w, ga: w.columns[LEMMA] if ga(w).columns[LEMMA] != "_" else "_"),
        "UAS": alignment_score(alignment, lambda w, ga: ga(w.parent)),
        "LAS": alignment_score(alignment, lambda w, ga: (ga(w.parent), w.columns[DEPREL])),
        # include enhanced DEPS score -- GB
        "ELAS": enhanced_alignment_score(alignment),
        "CLAS": alignment_score(alignment, lambda w, ga: (ga(w.parent), w.columns[DEPREL]),
                                filter_fn=lambda w: w.is_content_deprel),
        "MLAS": alignment_score(alignment, lambda w, ga: (ga(w.parent), w.columns[DEPREL], w.columns[UPOS], w.columns[FEATS],
                                                          [(ga(c), c.columns[DEPREL], c.columns[UPOS], c.columns[FEATS])
                                                           for c in w.functional_children]),
                                filter_fn=lambda w: w.is_content_deprel),
        "BLEX": alignment_score(alignment, lambda w, ga: (ga(w.parent), w.columns[DEPREL],
                                                          w.columns[LEMMA] if ga(w).columns[LEMMA] != "_" else "_"),
                                filter_fn=lambda w: w.is_content_deprel),
    }


def load_conllu_file(path, treebank_type):
    """Open *path* and parse it with load_conllu().

    Args:
        path: name of a CoNLL-U file.
        treebank_type: dict of enhancement flags (built in evaluate_wrapper)
            passed through to load_conllu().

    Returns:
        Whatever load_conllu() returns (parsed UD representation).
    """
    # NOTE(review): the handle is never explicitly closed; presumably
    # load_conllu() consumes it to EOF — consider a "with" block.
    # The ** trick adds encoding="utf-8" only on Python 3 (Python 2 compat).
    _file = open(path, mode="r", **({"encoding": "utf-8"} if sys.version_info >= (3, 0) else {}))
    return load_conllu(_file, treebank_type)

def evaluate_wrapper(args):
    """Load the gold and system CoNLL-U files named in *args* and score them.

    args.enhancements is a string of digit characters; each digit present
    disables one class of enhanced-dependency annotation in the gold data
    (see the --enhancements help text in main()).

    Returns:
        The metric dict produced by evaluate().
    """
    treebank_type = {}
    enhancements = list(args.enhancements)  # e.g. '12' -> ['1', '2']
    treebank_type['no_gapping'] = 1 if '1' in enhancements else 0
    treebank_type['no_shared_parents_in_coordination'] = 1 if '2' in enhancements else 0
    treebank_type['no_shared_dependents_in_coordination'] = 1 if '3' in enhancements else 0
    treebank_type['no_control'] = 1 if '4' in enhancements else 0
    treebank_type['no_external_arguments_of_relative_clauses'] = 1 if '5' in enhancements else 0
    treebank_type['no_case_info'] = 1 if '6' in enhancements else 0
    # Announce every enhancement restriction that is active.
    for key in treebank_type :
        if treebank_type[key] :
            print('evaluating with {} enhancements setting'.format(key))

    # Load CoNLL-U files
    gold_ud = load_conllu_file(args.gold_file, treebank_type)
    system_ud = load_conllu_file(args.system_file, treebank_type)
    return evaluate(gold_ud, system_ud)

def main():
    """Command-line entry point: parse args, evaluate, print the scores."""
    # Parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("gold_file", type=str,
                        help="Name of the CoNLL-U file with the gold data.")
    parser.add_argument("system_file", type=str,
                        help="Name of the CoNLL-U file with the predicted data.")
    parser.add_argument("--verbose", "-v", default=False, action="store_true",
                        help="Print all metrics.")
    parser.add_argument("--counts", "-c", default=False, action="store_true",
                        help="Print raw counts of correct/gold/system/aligned words instead of prec/rec/F1 for all metrics.")
    parser.add_argument("--enhancements", type=str, default='0',
                        help="Level of enhancements in the gold data (see guidelines) 0=all (default), 1=no gapping, 2=no shared parents, 3=no shared dependents 4=no control, 5=no external arguments, 6=no lemma info, 12=both 1 and 2 apply, etc.")
    args = parser.parse_args()

    # Evaluate
    evaluation = evaluate_wrapper(args)

    # Print the evaluation
    if not args.verbose and not args.counts:
        # Default: print only the headline F1 scores.
        print("LAS F1 Score: {:.2f}".format(100 * evaluation["LAS"].f1))
        print("ELAS F1 Score: {:.2f}".format(100 * evaluation["ELAS"].f1))

        print("MLAS Score: {:.2f}".format(100 * evaluation["MLAS"].f1))
        print("BLEX Score: {:.2f}".format(100 * evaluation["BLEX"].f1))
    else:
        # Full table: raw counts with -c, otherwise precision/recall/F1.
        if args.counts:
            print("Metric     | Correct   |      Gold | Predicted | Aligned")
        else:
            print("Metric     | Precision |    Recall |  F1 Score | AligndAcc")
        print("-----------+-----------+-----------+-----------+-----------")
        for metric in["Tokens", "Sentences", "Words", "UPOS", "XPOS", "UFeats", "AllTags", "Lemmas", "UAS", "LAS", "ELAS", "CLAS", "MLAS", "BLEX"]:
            if args.counts:
                print("{:11}|{:10} |{:10} |{:10} |{:10}".format(
                    metric,
                    evaluation[metric].correct,
                    evaluation[metric].gold_total,
                    evaluation[metric].system_total,
                    # "Words" has no aligned_total; show its correct count instead.
                    evaluation[metric].aligned_total or (evaluation[metric].correct if metric == "Words" else "")
                ))
            else:
                print("{:11}|{:10.2f} |{:10.2f} |{:10.2f} |{}".format(
                    metric,
                    100 * evaluation[metric].precision,
                    100 * evaluation[metric].recall,
                    100 * evaluation[metric].f1,
                    # aligned_accuracy may be None (e.g. for span metrics); print blank.
                    "{:10.2f}".format(100 * evaluation[metric].aligned_accuracy) if evaluation[metric].aligned_accuracy is not None else ""
                ))

if __name__ == "__main__":
    main()

# Tests, which can be executed with `python -m unittest conll18_ud_eval`.
class TestAlignment(unittest.TestCase):
    @staticmethod
    def _load_words(words):
        """Prepare fake CoNLL-U files with fake HEAD to prevent multiple roots errors.

        Args:
            words: list of entries; a plain string is one word, while a
                space-separated string "form part1 part2 ..." becomes a
                multi-word token whose surface form is the first field.

        Returns:
            The parsed representation returned by load_conllu().
        """
        # NOTE(review): load_conllu() is called here without the treebank_type
        # argument used elsewhere in this file — verify the signature.
        lines, num_words = [], 0
        for w in words:
            parts = w.split(" ")
            if len(parts) == 1:
                # Plain word: HEAD is 1 for every word after the first (fake tree).
                num_words += 1
                lines.append("{}\t{}\t_\t_\t_\t_\t{}\t_\t_\t_".format(num_words, parts[0], int(num_words>1)))
            else:
                # Multi-word token line (ID range), then one line per component word.
                lines.append("{}-{}\t{}\t_\t_\t_\t_\t_\t_\t_\t_".format(num_words + 1, num_words + len(parts) - 1, parts[0]))
                for part in parts[1:]:
                    num_words += 1
                    lines.append("{}\t{}\t_\t_\t_\t_\t{}\t_\t_\t_".format(num_words, part, int(num_words>1)))
        return load_conllu((io.StringIO if sys.version_info >= (3, 0) else io.BytesIO)("\n".join(lines+["\n"])))

    def _test_exception(self, gold, system):
        # Mismatched character sequences must raise UDError (see evaluate()).
        self.assertRaises(UDError, evaluate, self._load_words(gold), self._load_words(system))

    def _test_ok(self, gold, system, correct):
        """Assert Words precision/recall/F1 given *correct* aligned words."""
        metrics = evaluate(self._load_words(gold), self._load_words(system))
        # A multi-word entry "form p1 p2 ..." contributes len(parts)-1 words.
        gold_words = sum((max(1, len(word.split(" ")) - 1) for word in gold))
        system_words = sum((max(1, len(word.split(" ")) - 1) for word in system))
        self.assertEqual((metrics["Words"].precision, metrics["Words"].recall, metrics["Words"].f1),
                         (correct / system_words, correct / gold_words, 2 * correct / (gold_words + system_words)))

    def test_exception(self):
        self._test_exception(["a"], ["b"])

    def test_equal(self):
        self._test_ok(["a"], ["a"], 1)
        self._test_ok(["a", "b", "c"], ["a", "b", "c"], 3)

    def test_equal_with_multiword(self):
        self._test_ok(["abc a b c"], ["a", "b", "c"], 3)
        self._test_ok(["a", "bc b c", "d"], ["a", "b", "c", "d"], 4)
        self._test_ok(["abcd a b c d"], ["ab a b", "cd c d"], 4)
        self._test_ok(["abc a b c", "de d e"], ["a", "bcd b c d", "e"], 5)

    def test_alignment(self):
        self._test_ok(["abcd"], ["a", "b", "c", "d"], 0)
        self._test_ok(["abc", "d"], ["a", "b", "c", "d"], 1)
        self._test_ok(["a", "bc", "d"], ["a", "b", "c", "d"], 2)
        self._test_ok(["a", "bc b c", "d"], ["a", "b", "cd"], 2)
        self._test_ok(["abc a BX c", "def d EX f"], ["ab a b", "cd c d", "ef e f"], 4)
        self._test_ok(["ab a b", "cd bc d"], ["a", "bc", "d"], 2)
        self._test_ok(["a", "bc b c", "d"], ["ab AX BX", "cd CX a"], 1)