Source code for semstr.convert

#!/usr/bin/env python3

import sys

import configargparse
import csv
import os
import re
from glob import glob
from tqdm import tqdm
from ucca import ioutil, layer1
from ucca.convert import from_text, to_text, from_json, to_json
from ucca.normalization import normalize

from semstr.cfgutil import add_verbose_arg, add_boolean_option
from semstr.validation import validate, print_errors

# Description text handed to the command-line argument parser when this module is run as a script.
description = """Parses files in the specified format, and writes as the specified format.
Each passage is written to the file: <outdir>/<prefix><passage_id>.<extension> """


def from_conll(lines, passage_id, return_original=False, dep=False, preprocess=True, **kwargs):
    """Read text in CoNLL format through ``ConllConverter.from_format``.

    :param lines: iterable of lines in CoNLL format, describing a single passage.
    :param passage_id: ID to set for passage
    :param return_original: return triple of (UCCA passage, CoNLL string, sentence ID)
    :param dep: return dependency graph rather than converted UCCA passage
    :param preprocess: preprocess the dependency graph before converting to UCCA (or returning it)?
    :param kwargs: only the ``format`` entry is forwarded to the converter; other entries are ignored

    :return: generator of Passage objects
    """
    # Local import: the converter module is loaded only when this function is actually called.
    from semstr.conversion.conll import ConllConverter

    options = dict(passage_id=passage_id, return_original=return_original, dep=dep, preprocess=preprocess,
                   format=kwargs.get("format"))
    return ConllConverter().from_format(lines, **options)


def to_conll(passage, test=False, tree=False, preprocess=True, **kwargs):
    """Write a Passage object in CoNLL-X format (conll) through ``ConllConverter.to_format``.

    :param passage: the Passage object to convert
    :param test: whether to omit the head and deprel columns. Defaults to False
    :param tree: whether to omit rows for non-primary parents. Defaults to False
    :param preprocess: preprocess the converted dependency graph before returning it?
    :param kwargs: only the ``format`` entry is forwarded to the converter; other entries are ignored

    :return: list of lines representing the dependencies in the passage
    """
    # Local import: the converter module is loaded only when this function is actually called.
    from semstr.conversion.conll import ConllConverter

    converter = ConllConverter(tree=tree)
    label_format = kwargs.get("format")
    return converter.to_format(passage, test, preprocess=preprocess, format=label_format)


def from_export(lines, passage_id=None, return_original=False, **kwargs):
    """Read text in NeGra export format through ``ExportConverter.from_format``.

    :param lines: iterable of lines in NeGra export format, describing a single passage.
    :param passage_id: ID to set for passage, overriding the ID from the file
    :param return_original: return triple of (UCCA passage, Export string, sentence ID)
    :param kwargs: only the ``format`` entry is forwarded to the converter; other entries are ignored

    :return: generator of Passage objects
    """
    # Local import: the converter module is loaded only when this function is actually called.
    from semstr.conversion.export import ExportConverter

    options = dict(passage_id=passage_id, return_original=return_original, format=kwargs.get("format"))
    return ExportConverter().from_format(lines, **options)


def to_export(passage, test=False, tree=False, **kwargs):
    """Write a Passage object in NeGra export format (export) through ``ExportConverter.to_format``.

    :param passage: the Passage object to convert
    :param test: whether to omit the edge and parent columns. Defaults to False
    :param tree: whether to omit columns for non-primary parents. Defaults to False
    :param kwargs: only the ``format`` entry is forwarded to the converter; other entries are ignored

    :return: list of lines representing a (discontinuous) tree structure constructed from the passage
    """
    # Local import: the converter module is loaded only when this function is actually called.
    from semstr.conversion.export import ExportConverter

    options = dict(test=test, tree=tree, format=kwargs.get("format"))
    return ExportConverter().to_format(passage, **options)


def from_amr(lines, passage_id=None, return_original=False, save_original=True, wikification=False, placeholders=True,
             **kwargs):
    """Read text in AMR PENMAN format through ``AmrConverter.from_format``.

    :param lines: iterable of lines in AMR PENMAN format, describing a single passage.
    :param passage_id: ID to set for passage, overriding the ID from the file
    :param return_original: return triple of (UCCA passage, AMR string, AMR ID)
    :param save_original: whether to save original AMR text in passage.extra
    :param wikification: whether to use wikification for replacing node labels with placeholders based on tokens
    :param placeholders: introduce placeholders into node labels when they include the terminal's text?
    :param kwargs: only the ``format`` entry is forwarded to the converter; other entries are ignored

    :return: generator of Passage objects
    """
    # Local import: the converter module is loaded only when this function is actually called.
    from semstr.conversion.amr import AmrConverter

    options = dict(passage_id=passage_id, return_original=return_original, save_original=save_original,
                   wikification=wikification, placeholders=placeholders, format=kwargs.get("format"))
    return AmrConverter().from_format(lines, **options)


def to_amr(passage, metadata=True, wikification=True, use_original=True, verbose=False, default_label=None,
           **kwargs):
    """Write a Passage object in AMR PENMAN format (amr) through ``AmrConverter.to_format``.

    :param passage: the Passage object to convert
    :param metadata: whether to print ::id and ::tok lines
    :param wikification: whether to wikify named concepts, adding a :wiki triple
    :param use_original: whether to use original AMR text from passage.extra
    :param verbose: whether to print extra information
    :param default_label: label to use in case node has no label attribute
    :param kwargs: only the ``format`` entry is forwarded to the converter; other entries are ignored

    :return: list of lines representing an AMR in PENMAN format, constructed from the passage
    """
    # Local import: the converter module is loaded only when this function is actually called.
    from semstr.conversion.amr import AmrConverter

    converter = AmrConverter()
    keyword_options = dict(use_original=use_original, default_label=default_label, format=kwargs.get("format"))
    # metadata, wikification and verbose are handed over positionally, in this order.
    return converter.to_format(passage, metadata, wikification, verbose, **keyword_options)


def from_conllu(lines, passage_id=None, return_original=False, annotate=False, terminals_only=False, dep=False,
                enhanced=True, preprocess=True, **kwargs):
    """Read text in Universal Dependencies format through ``ConlluConverter.from_format``.

    :param lines: iterable of lines in Universal Dependencies format, describing a single passage.
    :param passage_id: ID to set for passage
    :param return_original: return triple of (UCCA passage, Universal Dependencies string, sentence ID)
    :param annotate: whether to save dependency annotations in "extra" dict of layer 0
    :param terminals_only: create only terminals (with any annotation if specified), no non-terminals
    :param dep: return dependency graph rather than converted UCCA passage
    :param enhanced: whether to include enhanced edges
    :param preprocess: preprocess the dependency graph before converting to UCCA (or returning it)?
    :param kwargs: only the ``format`` entry is forwarded to the converter; other entries are ignored

    :return: generator of Passage objects
    """
    # Local import: the converter module is loaded only when this function is actually called.
    from semstr.conversion.conllu import ConlluConverter

    converter = ConlluConverter(enhanced=enhanced)
    options = dict(passage_id=passage_id, return_original=return_original, annotate=annotate,
                   terminals_only=terminals_only, dep=dep, preprocess=preprocess, format=kwargs.get("format"))
    return converter.from_format(lines, **options)


def to_conllu(passage, test=False, enhanced=True, preprocess=True, **kwargs):
    """Write a Passage object in Universal Dependencies format (conllu) through ``ConlluConverter.to_format``.

    :param passage: the Passage object to convert
    :param test: whether to omit the head and deprel columns. Defaults to False
    :param enhanced: whether to include enhanced edges
    :param preprocess: preprocess the converted dependency graph before returning it?
    :param kwargs: only the ``format`` entry is forwarded to the converter; other entries are ignored

    :return: list of lines representing the semantic dependencies in the passage
    """
    # Local import: the converter module is loaded only when this function is actually called.
    from semstr.conversion.conllu import ConlluConverter

    converter = ConlluConverter(enhanced=enhanced)
    options = dict(test=test, preprocess=preprocess, format=kwargs.get("format"))
    return converter.to_format(passage, **options)


def from_sdp(lines, passage_id, mark_aux=False, return_original=False, dep=False, preprocess=True, **kwargs):
    """Read text in SemEval 2015 SDP format through ``SdpConverter.from_format``.

    :param lines: iterable of lines in SDP format, describing a single passage.
    :param passage_id: ID to set for passage
    :param mark_aux: add a preceding # for labels of auxiliary edges added
    :param return_original: return triple of (UCCA passage, SDP string, sentence ID)
    :param dep: return dependency graph rather than converted UCCA passage
    :param preprocess: preprocess the dependency graph before converting to UCCA (or returning it)?
    :param kwargs: only the ``format`` entry is forwarded to the converter; other entries are ignored

    :return: generator of Passage objects
    """
    # Local import: the converter module is loaded only when this function is actually called.
    from semstr.conversion.sdp import SdpConverter

    converter = SdpConverter(mark_aux=mark_aux)
    options = dict(passage_id=passage_id, return_original=return_original, dep=dep, preprocess=preprocess,
                   format=kwargs.get("format"))
    return converter.from_format(lines, **options)


def to_sdp(passage, test=False, tree=False, mark_aux=False, preprocess=True, **kwargs):
    """Write a Passage object in SemEval 2015 SDP format (sdp) through ``SdpConverter.to_format``.

    :param passage: the Passage object to convert
    :param test: whether to omit the top, head, frame, etc. columns. Defaults to False
    :param tree: whether to omit columns for non-primary parents. Defaults to False
    :param mark_aux: omit edges with labels with a preceding #
    :param preprocess: preprocess the converted dependency graph before returning it?
    :param kwargs: only the ``format`` entry is forwarded to the converter; other entries are ignored

    :return: list of lines representing the semantic dependencies in the passage
    """
    # Local import: the converter module is loaded only when this function is actually called.
    from semstr.conversion.sdp import SdpConverter

    converter = SdpConverter(mark_aux=mark_aux, tree=tree)
    options = dict(test=test, preprocess=preprocess, format=kwargs.get("format"))
    return converter.to_format(passage, **options)


# Registry of supported formats: format name -> (reader function, writer function).
# The None key has no functions attached; the script section below passes this dict as the
# ``choices`` of --input-format and --output-format.
CONVERTERS = {
    None: (None, None),
    "json": (from_json, to_json),
    "conll": (from_conll, to_conll),
    "conllu": (from_conllu, to_conllu),
    "sdp": (from_sdp, to_sdp),
    "export": (from_export, to_export),
    "amr": (from_amr, to_amr),
    "txt": (from_text, to_text),
}
# Per-direction lookup tables; entries whose function is None (i.e. the None key) are left out.
FROM_FORMAT = {f: c[0] for f, c in CONVERTERS.items() if c[0] is not None}
TO_FORMAT = {f: c[1] for f, c in CONVERTERS.items() if c[1] is not None}

# Extensions read as UCCA passages by iter_passages; write_passage indexes this tuple with its
# ``binary`` flag (False -> ".xml", True -> ".pickle").
UCCA_EXT = (".xml", ".pickle")


def iter_files(patterns):
    """Yield the paths matching each glob pattern, one pattern after the other.

    :param patterns: a single glob pattern string, or an iterable of glob pattern strings
    :return: generator of matching paths; the matches of each pattern are sorted, and patterns are
             processed in the order given
    :raises FileNotFoundError: when iteration reaches a pattern that matches nothing (this is a subclass
                               of IOError/OSError, so existing ``except IOError`` handlers still apply)
    """
    # A bare string would otherwise be iterated character by character, each character being globbed.
    if isinstance(patterns, str):
        patterns = (patterns,)
    for pattern in patterns:
        filenames = sorted(glob(pattern))
        if not filenames:
            raise FileNotFoundError("Not found: " + pattern)
        yield from filenames


def iter_passages(patterns, desc=None, input_format=None, prefix="", label_map=None, output_format=None, **kwargs):
    """Load every file matching the given patterns and yield the passages read from it.

    Files whose extension is in ``UCCA_EXT`` are read with ``ioutil.file2passage``. Any other file is
    opened as UTF-8 text and handed to the reader function registered in ``FROM_FORMAT`` for
    ``input_format`` (or, when that is not given, for the file extension); ``from_text`` is used when no
    reader is registered under that name.

    :param patterns: glob patterns, expanded by ``iter_files``
    :param desc: description shown on the progress bar
    :param input_format: format name selecting the reader, overriding the file extension
    :param prefix: string prepended to the passage ID derived from the file name
    :param label_map: when truthy, ``output_format`` is passed to the reader as ``format``; otherwise None is
    :param output_format: format name forwarded to the reader as ``format`` (see ``label_map``)
    :param kwargs: further keyword arguments forwarded unchanged to the reader

    :return: generator of whatever ``ioutil.file2passage`` returns / the reader yields
    :raises IOError: if a pattern matches nothing or a matched path is not a regular file
    """
    # The file list is materialized up front so the progress bar receives a sized sequence.
    progress = tqdm(list(iter_files(patterns)), unit="file", desc=desc)
    for filename in progress:
        progress.set_postfix(file=os.path.basename(filename))
        if not os.path.isfile(filename):
            raise IOError("Not a file: %s" % filename)
        no_ext, ext = os.path.splitext(filename)
        if ext in UCCA_EXT:  # UCCA input
            yield ioutil.file2passage(filename)
            continue
        basename = os.path.basename(no_ext)
        # Passage ID: first run of digits (optionally dot-separated) in the base name, else the whole name.
        id_match = re.search(r"\d+(\.\d+)*", basename)
        passage_id = id_match.group(0) if id_match else basename
        # The fallback must be the reader itself: a tuple here is not callable and would raise TypeError below.
        converter = FROM_FORMAT.get(input_format or ext.lstrip("."), from_text)
        with open(filename, encoding="utf-8") as f:
            yield from converter(f, prefix + passage_id, format=output_format if label_map else None, **kwargs)


def map_labels(passage, label_map_file):
    """Replace edge tags in layer 1 of the passage, in place, according to a CSV label map.

    Does nothing when ``label_map_file`` is falsy. Otherwise, each edge tag is looked up in the map, first
    as a whole and then by its part before the first ":". Edges with a mapping get the mapped tag; edges
    without one are removed if their ``remote`` attribute is truthy and left untouched otherwise. Finally
    the ``"format"`` entry of ``passage.extra`` is deleted if present.

    :param passage: passage whose layer-1 edges are relabelled
    :param label_map_file: path of a CSV file with two columns (input label, output label), or a falsy value
    :raises ValueError: if a non-blank CSV row does not have exactly two columns
    """
    if not label_map_file:
        return
    label_map = _read_label_map(label_map_file)
    for node in passage.layer(layer1.LAYER_ID).all:
        for edge in list(node):  # iterate over a copy, since edges may be removed from the node
            mapped = label_map.get(edge.tag) or label_map.get(edge.tag.partition(":")[0])
            if mapped is not None:
                edge.tag = mapped
            elif edge.attrib.get("remote"):
                node.remove(edge)
    try:
        del passage.extra["format"]  # Remove original format as it no longer applies, after labels were replaced
    except KeyError:
        pass


def _read_label_map(label_map_file):
    """Load the two-column CSV label map into a dict, skipping blank rows."""
    # newline="" lets the csv module do its own newline handling, as its documentation recommends.
    with open(label_map_file, encoding="utf-8", newline="") as f:
        # csv.reader yields [] for blank lines, which dict() would reject with ValueError.
        return dict(row for row in csv.reader(f) if row)


def write_passage(passage, out_dir=".", output_format=None, binary=False, verbose=False, label_map=False, split=False,
                  join=None, **kwargs):
    """Write one passage to ``<out_dir>/<join or passage.ID><extension>``.

    :param passage: the passage to write
    :param out_dir: directory of the output file
    :param output_format: key into ``TO_FORMAT``, or None to write UCCA with ``ioutil.passage2file``
    :param binary: selects the UCCA extension (``UCCA_EXT[binary]``) and is passed to ``ioutil.passage2file``
    :param verbose: when truthy, print the output path to stderr before writing
    :param label_map: when truthy, ``output_format`` is passed to the writer function as ``format``; otherwise
                      None is
    :param split: passed to the writer function as ``sentences``
    :param join: file name used instead of the passage ID; when set, non-UCCA output is appended to the file
    :param kwargs: further keyword arguments forwarded unchanged to the writer function
    """
    # UCCA and AMR have dedicated extensions; every other format uses "." + its own name.
    special_extensions = {None: UCCA_EXT[binary], "amr": ".txt"}
    ext = special_extensions.get(output_format) or "." + output_format
    if join and join.endswith(ext):
        ext = ""  # the joined file name already carries the extension
    outfile = os.path.join(out_dir, (join or passage.ID) + ext)
    if verbose:
        with ioutil.external_write_mode():
            print("Writing '%s'..." % outfile, file=sys.stderr)
    if output_format is None:  # UCCA output
        ioutil.passage2file(passage, outfile, binary=binary)
        return
    to_format = TO_FORMAT[output_format]
    mode = "a" if join else "w"
    with open(outfile, mode, encoding="utf-8") as f:
        for line in to_format(passage, format=output_format if label_map else None, sentences=split, **kwargs):
            print(line, file=f)


def main(args):
    """Convert all input files described by the parsed command-line arguments.

    For each passage from ``iter_passages``: map labels, normalize (unless disabled or the output format
    is "txt"), set the ``lang`` attribute if given, write the passage, and optionally validate it.
    Exits the process with status 1 on the first passage for which validation reports errors.

    :param args: parsed argument namespace; its attributes are also forwarded as keyword arguments to
                 ``iter_passages``, ``write_passage`` and ``validate``
    """
    os.makedirs(args.out_dir, exist_ok=True)
    options = vars(args)
    for passage in iter_passages(args.filenames, desc="Converting", **options):
        map_labels(passage, args.label_map)
        should_normalize = args.normalize and args.output_format != "txt"
        if should_normalize:
            normalize(passage, extra=args.extra_normalization)
        if args.lang:
            passage.attrib["lang"] = args.lang
        write_passage(passage, **options)
        if not args.validate:
            continue
        try:
            errors = list(validate(passage, **options))
        except ValueError:
            # NOTE(review): a ValueError from validation is skipped silently - confirm this is intended.
            continue
        if errors:
            print_errors(errors, passage.ID)
            sys.exit(1)


def add_convert_args(p):
    """Register the conversion-related options on the given argument parser.

    Adds the boolean options ``test``, ``tree``, ``split`` and ``mark-aux`` via ``add_boolean_option``,
    followed by the ``--label-map`` argument.

    :param p: argument parser to add the options to
    """
    boolean_options = (
        ("test", "omit prediction columns (head and deprel for conll; top, pred, frame, etc. for sdp)", "t"),
        ("tree", "remove multiple parents to get a tree", "T"),
        ("split", "split each sentence to its own passage", "s"),
        ("mark-aux", "mark auxiliary edges introduced/omit edges", "m"),
    )
    for name, help_text, short in boolean_options:
        add_boolean_option(p, name, help_text, short=short)
    p.add_argument("--label-map", help="CSV file specifying mapping of input edge labels to output edge labels")


# Script entry point: build the command-line parser, run the conversion, and exit with status 0.
if __name__ == '__main__':
    argparser = configargparse.ArgParser(description=description)
    argparser.add_argument("filenames", nargs="+", help="file names to convert")
    argparser.add_argument("-i", "--input-format", choices=CONVERTERS, help="input file format (detected by extension)")
    argparser.add_argument("-f", "--output-format", choices=CONVERTERS, help="output file format (default: UCCA)")
    argparser.add_argument("-o", "--out-dir", default=".", help="output directory")
    argparser.add_argument("-j", "--join", help="concatenate all output files to a file with this name")
    argparser.add_argument("-p", "--prefix", default="", help="output passage ID prefix")
    # UCCA_EXT[1] already starts with a dot, so it is interpolated as-is (previously rendered "(..pickle)").
    add_boolean_option(argparser, "binary", "write in binary format (%s)" % UCCA_EXT[1], short="b")
    add_boolean_option(argparser, "annotate", "store dependency annotations in 'extra' dict", short="a")
    add_boolean_option(argparser, "validate", "validate every passage after conversion", short="V")
    add_boolean_option(argparser, "ucca-validation", "apply UCCA-specific validations", short="u")
    add_boolean_option(argparser, "enhanced", "read enhanced dependencies", default=True)
    add_boolean_option(argparser, "wikification", "AMR wikification", default=True)
    argparser.add_argument("--default-label", help="use this for missing AMR labels, otherwise raise exception")
    add_boolean_option(argparser, "normalize", "normalize passage", default=True)
    add_boolean_option(argparser, "extra-normalization", "more normalization rules")
    argparser.add_argument("-l", "--lang", help="small two-letter language code to set in output passage metadata")
    add_convert_args(argparser)
    add_verbose_arg(argparser, help="detailed output")
    main(argparser.parse_args())
    sys.exit(0)
Source code for semstr.convert

SEMSTR

Navigation

Related Topics