#!/usr/bin/env python3
import sys
import configargparse
import csv
import os
import re
from glob import glob
from tqdm import tqdm
from ucca import ioutil, layer1
from ucca.convert import from_text, to_text, from_json, to_json
from ucca.normalization import normalize
from semstr.cfgutil import add_verbose_arg, add_boolean_option
from semstr.validation import validate, print_errors
# Program description; passed to the argument parser as its description when this module is run as a script.
description = """Parses files in the specified format, and writes as the specified format.
Each passage is written to the file: <outdir>/<prefix><passage_id>.<extension> """
def from_conll(lines, passage_id, return_original=False, dep=False, preprocess=True, **kwargs):
    """Read a passage from text in CoNLL format.

    :param lines: iterable of lines in CoNLL format, describing a single passage
    :param passage_id: ID to assign to the passage
    :param return_original: return triple of (UCCA passage, CoNLL string, sentence ID)
    :param dep: return the dependency graph instead of the converted UCCA passage
    :param preprocess: preprocess the dependency graph before converting it to UCCA (or returning it)?
    :param kwargs: only the "format" entry is read; it is forwarded to the converter (None if absent)
    :return: generator of Passage objects
    """
    from semstr.conversion.conll import ConllConverter  # imported at call time

    converter = ConllConverter()
    options = dict(passage_id=passage_id, return_original=return_original, dep=dep, preprocess=preprocess,
                   format=kwargs.get("format"))
    return converter.from_format(lines, **options)
def to_conll(passage, test=False, tree=False, preprocess=True, **kwargs):
    """Render a Passage object as CoNLL-X format (conll).

    :param passage: the Passage object to convert
    :param test: whether to omit the head and deprel columns. Defaults to False
    :param tree: whether to omit rows for non-primary parents. Defaults to False
    :param preprocess: preprocess the converted dependency graph before returning it?
    :param kwargs: only the "format" entry is read; it is forwarded to the converter (None if absent)
    :return: list of lines representing the dependencies in the passage
    """
    from semstr.conversion.conll import ConllConverter  # imported at call time

    converter = ConllConverter(tree=tree)
    target_format = kwargs.get("format")
    return converter.to_format(passage, test, preprocess=preprocess, format=target_format)
def from_export(lines, passage_id=None, return_original=False, **kwargs):
    """Read a passage from text in NeGra export format.

    :param lines: iterable of lines in NeGra export format, describing a single passage
    :param passage_id: ID to assign to the passage, overriding the ID from the file
    :param return_original: return triple of (UCCA passage, Export string, sentence ID)
    :param kwargs: only the "format" entry is read; it is forwarded to the converter (None if absent)
    :return: generator of Passage objects
    """
    from semstr.conversion.export import ExportConverter  # imported at call time

    converter = ExportConverter()
    options = dict(passage_id=passage_id, return_original=return_original, format=kwargs.get("format"))
    return converter.from_format(lines, **options)
def to_export(passage, test=False, tree=False, **kwargs):
    """Render a Passage object as NeGra export format (export).

    :param passage: the Passage object to convert
    :param test: whether to omit the edge and parent columns. Defaults to False
    :param tree: whether to omit columns for non-primary parents. Defaults to False
    :param kwargs: only the "format" entry is read; it is forwarded to the converter (None if absent)
    :return: list of lines representing a (discontinuous) tree structure constructed from the passage
    """
    from semstr.conversion.export import ExportConverter  # imported at call time

    converter = ExportConverter()
    target_format = kwargs.get("format")
    return converter.to_format(passage, test=test, tree=tree, format=target_format)
def from_amr(lines, passage_id=None, return_original=False, save_original=True, wikification=False, placeholders=True,
             **kwargs):
    """Read a passage from text in AMR PENMAN format.

    :param lines: iterable of lines in AMR PENMAN format, describing a single passage
    :param passage_id: ID to assign to the passage, overriding the ID from the file
    :param return_original: return triple of (UCCA passage, AMR string, AMR ID)
    :param save_original: whether to save the original AMR text in passage.extra
    :param wikification: whether to use wikification for replacing node labels with placeholders based on tokens
    :param placeholders: introduce placeholders into node labels when they include the terminal's text?
    :param kwargs: only the "format" entry is read; it is forwarded to the converter (None if absent)
    :return: generator of Passage objects
    """
    from semstr.conversion.amr import AmrConverter  # imported at call time

    converter = AmrConverter()
    options = dict(passage_id=passage_id, return_original=return_original, save_original=save_original,
                   wikification=wikification, placeholders=placeholders, format=kwargs.get("format"))
    return converter.from_format(lines, **options)
def to_amr(passage, metadata=True, wikification=True, use_original=True, verbose=False, default_label=None,
           **kwargs):
    """Render a Passage object as AMR PENMAN format (amr).

    :param passage: the Passage object to convert
    :param metadata: whether to print ::id and ::tok lines
    :param wikification: whether to wikify named concepts, adding a :wiki triple
    :param use_original: whether to use the original AMR text from passage.extra
    :param verbose: whether to print extra information
    :param default_label: label to use in case a node has no label attribute
    :param kwargs: only the "format" entry is read; it is forwarded to the converter (None if absent)
    :return: list of lines representing an AMR in PENMAN format, constructed from the passage
    """
    from semstr.conversion.amr import AmrConverter  # imported at call time

    converter = AmrConverter()
    target_format = kwargs.get("format")
    # metadata, wikification and verbose are passed positionally, in the order the converter receives them
    return converter.to_format(passage, metadata, wikification, verbose, use_original=use_original,
                               default_label=default_label, format=target_format)
def from_conllu(lines, passage_id=None, return_original=False, annotate=False, terminals_only=False, dep=False,
                enhanced=True, preprocess=True, **kwargs):
    """Read a passage from text in Universal Dependencies format.

    :param lines: iterable of lines in Universal Dependencies format, describing a single passage
    :param passage_id: ID to assign to the passage
    :param return_original: return triple of (UCCA passage, Universal Dependencies string, sentence ID)
    :param annotate: whether to save dependency annotations in the "extra" dict of layer 0
    :param terminals_only: create only terminals (with any annotation if specified), no non-terminals
    :param dep: return the dependency graph instead of the converted UCCA passage
    :param enhanced: whether to include enhanced edges
    :param preprocess: preprocess the dependency graph before converting it to UCCA (or returning it)?
    :param kwargs: only the "format" entry is read; it is forwarded to the converter (None if absent)
    :return: generator of Passage objects
    """
    from semstr.conversion.conllu import ConlluConverter  # imported at call time

    converter = ConlluConverter(enhanced=enhanced)
    options = dict(passage_id=passage_id, return_original=return_original, annotate=annotate,
                   terminals_only=terminals_only, dep=dep, preprocess=preprocess, format=kwargs.get("format"))
    return converter.from_format(lines, **options)
def to_conllu(passage, test=False, enhanced=True, preprocess=True, **kwargs):
    """Render a Passage object as Universal Dependencies format (conllu).

    :param passage: the Passage object to convert
    :param test: whether to omit the head and deprel columns. Defaults to False
    :param enhanced: whether to include enhanced edges
    :param preprocess: preprocess the converted dependency graph before returning it?
    :param kwargs: only the "format" entry is read; it is forwarded to the converter (None if absent)
    :return: list of lines representing the semantic dependencies in the passage
    """
    from semstr.conversion.conllu import ConlluConverter  # imported at call time

    converter = ConlluConverter(enhanced=enhanced)
    target_format = kwargs.get("format")
    return converter.to_format(passage, test=test, preprocess=preprocess, format=target_format)
def from_sdp(lines, passage_id, mark_aux=False, return_original=False, dep=False, preprocess=True, **kwargs):
    """Read a passage from text in SemEval 2015 SDP format.

    :param lines: iterable of lines in SDP format, describing a single passage
    :param passage_id: ID to assign to the passage
    :param mark_aux: add a preceding # for labels of auxiliary edges added
    :param return_original: return triple of (UCCA passage, SDP string, sentence ID)
    :param dep: return the dependency graph instead of the converted UCCA passage
    :param preprocess: preprocess the dependency graph before converting it to UCCA (or returning it)?
    :param kwargs: only the "format" entry is read; it is forwarded to the converter (None if absent)
    :return: generator of Passage objects
    """
    from semstr.conversion.sdp import SdpConverter  # imported at call time

    converter = SdpConverter(mark_aux=mark_aux)
    options = dict(passage_id=passage_id, return_original=return_original, dep=dep, preprocess=preprocess,
                   format=kwargs.get("format"))
    return converter.from_format(lines, **options)
def to_sdp(passage, test=False, tree=False, mark_aux=False, preprocess=True, **kwargs):
    """Render a Passage object as SemEval 2015 SDP format (sdp).

    :param passage: the Passage object to convert
    :param test: whether to omit the top, head, frame, etc. columns. Defaults to False
    :param tree: whether to omit columns for non-primary parents. Defaults to False
    :param mark_aux: omit edges with labels with a preceding #
    :param preprocess: preprocess the converted dependency graph before returning it?
    :param kwargs: only the "format" entry is read; it is forwarded to the converter (None if absent)
    :return: list of lines representing the semantic dependencies in the passage
    """
    from semstr.conversion.sdp import SdpConverter  # imported at call time

    converter = SdpConverter(mark_aux=mark_aux, tree=tree)
    target_format = kwargs.get("format")
    return converter.to_format(passage, test=test, preprocess=preprocess, format=target_format)
# Format name -> (reader, writer) pair. The None key stands for the case where no format name is given
# and has no converter functions of its own.
CONVERTERS = {
    None: (None, None),
    "json": (from_json, to_json),
    "conll": (from_conll, to_conll),
    "conllu": (from_conllu, to_conllu),
    "sdp": (from_sdp, to_sdp),
    "export": (from_export, to_export),
    "amr": (from_amr, to_amr),
    "txt": (from_text, to_text),
}

# Format name -> reader function, for every format that has one
FROM_FORMAT = {name: reader for name, (reader, _) in CONVERTERS.items() if reader is not None}

# Format name -> writer function, for every format that has one
TO_FORMAT = {name: writer for name, (_, writer) in CONVERTERS.items() if writer is not None}

# File extensions treated as UCCA files; also indexed by the "binary" flag (False -> .xml, True -> .pickle)
UCCA_EXT = (".xml", ".pickle")
def iter_files(patterns):
    """Yield the paths matching each glob pattern, sorted within each pattern.

    :param patterns: iterable of glob pattern strings
    :return: generator of matching file names, pattern by pattern
    :raises IOError: as soon as a pattern that matches nothing is reached
    """
    for glob_pattern in patterns:
        matches = glob(glob_pattern)
        matches.sort()
        if matches:
            yield from matches
        else:
            raise IOError("Not found: " + glob_pattern)
def iter_passages(patterns, desc=None, input_format=None, prefix="", label_map=None, output_format=None, **kwargs):
    """Yield the passages read from all files matching the given glob patterns.

    Files whose extension is in UCCA_EXT are loaded with ioutil.file2passage. Any other file is opened
    as UTF-8 text and handed to the reader registered in FROM_FORMAT for input_format (or, when that is
    not given, for the file extension); formats with no registered reader fall back to from_text.

    :param patterns: iterable of glob patterns selecting the input files
    :param desc: description shown on the progress bar
    :param input_format: name of the input format, overriding detection by file extension
    :param prefix: string prepended to the passage ID derived from each file name
    :param label_map: if truthy, output_format is passed to the reader as "format"; otherwise None is passed
    :param output_format: name of the output format (only used as described for label_map)
    :param kwargs: further keyword arguments forwarded to the reader
    :return: generator of passages
    :raises IOError: if a pattern matches nothing, or a matched path is not a regular file
    """
    # The file list is materialized so that the progress bar knows the total number of files
    progress = tqdm(list(iter_files(patterns)), unit="file", desc=desc)
    for filename in progress:
        progress.set_postfix(file=os.path.basename(filename))
        if not os.path.isfile(filename):
            raise IOError("Not a file: %s" % filename)
        no_ext, ext = os.path.splitext(filename)
        if ext in UCCA_EXT:  # UCCA input
            yield ioutil.file2passage(filename)
        else:
            basename = os.path.basename(no_ext)
            # The ID is the first (possibly dotted) number in the file name, or the whole name if there is none
            id_match = re.search(r"\d+(\.\d+)*", basename)
            passage_id = id_match.group(0) if id_match else basename
            # The fallback must be a function, like every value in FROM_FORMAT: a tuple would not be callable
            converter = FROM_FORMAT.get(input_format or ext.lstrip("."), from_text)
            with open(filename, encoding="utf-8") as f:
                yield from converter(f, prefix + passage_id, format=output_format if label_map else None, **kwargs)
def map_labels(passage, label_map_file):
    """Replace the edge labels of a passage in place, according to a CSV mapping file.

    Does nothing when label_map_file is falsy. Otherwise each edge tag is looked up in the mapping,
    first in full and then by the part before its first ":". An edge with no mapping is removed if
    it is marked as remote, and left unchanged otherwise.

    :param passage: passage whose layer-1 edges are relabeled
    :param label_map_file: path of a CSV file whose rows are (input label, output label) pairs, or None
    """
    if not label_map_file:
        return
    with open(label_map_file, encoding="utf-8") as f:
        label_map = dict(csv.reader(f))
    for node in passage.layer(layer1.LAYER_ID).all:
        for edge in list(node):  # iterate over a copy, since edges may be removed from the node
            new_tag = label_map.get(edge.tag) or label_map.get(edge.tag.partition(":")[0])
            if new_tag is not None:
                edge.tag = new_tag
            elif edge.attrib.get("remote"):
                node.remove(edge)
    # Remove original format as it no longer applies, after labels were replaced
    try:
        del passage.extra["format"]
    except KeyError:
        pass
def write_passage(passage, out_dir=".", output_format=None, binary=False, verbose=False, label_map=False, split=False,
                  join=None, **kwargs):
    """Write a passage to a file, in UCCA format or converted to another format.

    :param passage: the passage to write
    :param out_dir: directory in which the file is created
    :param output_format: key of TO_FORMAT selecting the writer, or None to write a UCCA file
    :param binary: selects the UCCA extension (UCCA_EXT[binary]) and is passed on to ioutil.passage2file
    :param verbose: whether to print the output file name to standard error
    :param label_map: if truthy, output_format is passed to the writer as "format"; otherwise None is passed
    :param split: passed to the writer as "sentences"
    :param join: file name to use instead of the passage ID; converted output is then appended to it
    :param kwargs: further keyword arguments forwarded to the writer
    """
    ucca_ext = UCCA_EXT[binary]
    if output_format is None:
        ext = ucca_ext
    elif output_format == "amr":
        ext = ".txt"
    else:
        ext = "." + output_format
    if join and join.endswith(ext):
        ext = ""  # the joined file name already carries the extension
    stem = join or passage.ID
    outfile = os.path.join(out_dir, stem + ext)
    if verbose:
        with ioutil.external_write_mode():
            print("Writing '%s'..." % outfile, file=sys.stderr)
    if output_format is None:  # UCCA output
        ioutil.passage2file(passage, outfile, binary=binary)
        return
    converter = TO_FORMAT[output_format]
    mode = "a" if join else "w"
    target_format = output_format if label_map else None
    with open(outfile, mode, encoding="utf-8") as f:
        for line in converter(passage, format=target_format, sentences=split, **kwargs):
            print(line, file=f)
def main(args):
    """Convert all input files and write the resulting passages to the output directory.

    For each passage: labels are mapped, the passage is normalized (unless disabled or the output
    format is "txt"), the language attribute is set if given, and the passage is written. If
    validation is requested, the process exits with status 1 at the first passage with errors;
    a ValueError raised during validation skips that passage's validation.

    :param args: parsed arguments; reads out_dir, filenames, label_map, normalize, output_format,
                 extra_normalization, lang and validate, and forwards all of them as keyword arguments
    """
    os.makedirs(args.out_dir, exist_ok=True)
    options = vars(args)
    for passage in iter_passages(args.filenames, desc="Converting", **options):
        map_labels(passage, args.label_map)
        skip_normalization = not args.normalize or args.output_format == "txt"
        if not skip_normalization:
            normalize(passage, extra=args.extra_normalization)
        if args.lang:
            passage.attrib["lang"] = args.lang
        write_passage(passage, **options)
        if not args.validate:
            continue
        try:
            errors = list(validate(passage, **options))
        except ValueError:
            continue
        if errors:
            print_errors(errors, passage.ID)
            sys.exit(1)
def add_convert_args(p):
    """Register the conversion options (test, tree, split, mark-aux, label-map) on an argument parser.

    :param p: argument parser to add the options to
    """
    boolean_options = (
        ("test", "omit prediction columns (head and deprel for conll; top, pred, frame, etc. for sdp)", "t"),
        ("tree", "remove multiple parents to get a tree", "T"),
        ("split", "split each sentence to its own passage", "s"),
        ("mark-aux", "mark auxiliary edges introduced/omit edges", "m"),
    )
    for name, help_text, short in boolean_options:
        add_boolean_option(p, name, help_text, short=short)
    p.add_argument("--label-map", help="CSV file specifying mapping of input edge labels to output edge labels")
# Script entry point: build the command-line parser, run the conversion and exit with status 0.
if __name__ == "__main__":
    argparser = configargparse.ArgParser(description=description)
    argparser.add_argument("filenames", nargs="+", help="file names to convert")
    argparser.add_argument("-i", "--input-format", choices=CONVERTERS, help="input file format (detected by extension)")
    argparser.add_argument("-f", "--output-format", choices=CONVERTERS, help="output file format (default: UCCA)")
    argparser.add_argument("-o", "--out-dir", default=".", help="output directory")
    argparser.add_argument("-j", "--join", help="concatenate all output files to a file with this name")
    argparser.add_argument("-p", "--prefix", default="", help="output passage ID prefix")
    # UCCA_EXT entries already start with a dot, so the help text must not add another one
    add_boolean_option(argparser, "binary", "write in binary format (%s)" % UCCA_EXT[1], short="b")
    add_boolean_option(argparser, "annotate", "store dependency annotations in 'extra' dict", short="a")
    add_boolean_option(argparser, "validate", "validate every passage after conversion", short="V")
    add_boolean_option(argparser, "ucca-validation", "apply UCCA-specific validations", short="u")
    add_boolean_option(argparser, "enhanced", "read enhanced dependencies", default=True)
    add_boolean_option(argparser, "wikification", "AMR wikification", default=True)
    argparser.add_argument("--default-label", help="use this for missing AMR labels, otherwise raise exception")
    add_boolean_option(argparser, "normalize", "normalize passage", default=True)
    add_boolean_option(argparser, "extra-normalization", "more normalization rules")
    argparser.add_argument("-l", "--lang", help="small two-letter language code to set in output passage metadata")
    add_convert_args(argparser)
    add_verbose_arg(argparser, help="detailed output")
    main(argparser.parse_args())
    sys.exit(0)