The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.

NAME

Ufal::MorphoDiTa - bindings to MorphoDiTa library http://ufal.mff.cuni.cz/morphodita.

SYNOPSIS

  use Ufal::MorphoDiTa;

  # Load the tagger model and stop if that does not succeed.
  my $tagger_file = 'english-morphium-wsj-140407.tagger';
  my $tagger = Ufal::MorphoDiTa::Tagger::load($tagger_file);
  $tagger or die "Cannot load tagger from file '$tagger_file'\n";

  # Fill the input vector with the tokens of the sentence to tag.
  my $forms = Ufal::MorphoDiTa::Forms->new();
  foreach my $word (qw(How are you ?)) {
    $forms->push($word);
  }

  # Tag the sentence; the tagged lemmas are written into $lemmas.
  my $lemmas = Ufal::MorphoDiTa::TaggedLemmas->new();
  $tagger->tag($forms, $lemmas);

  # Print every token together with its lemma and tag.
  for my $idx (0 .. $lemmas->size() - 1) {
    my $tagged = $lemmas->get($idx);
    printf "token = %s, lemma = %s, tag = %s\n",
      $forms->get($idx), $tagged->{lemma}, $tagged->{tag};
  }

REQUIREMENTS

To compile the module, a C++11 compiler is needed: either g++ 4.7 or newer, clang 3.2 or newer, or Visual Studio 2015.

DESCRIPTION

Ufal::MorphoDiTa is a Perl binding to the MorphoDiTa library http://ufal.mff.cuni.cz/morphodita.

The bindings are a straightforward conversion of the C++ bindings API. Vectors do not have a native Perl interface; see the Ufal::MorphoDiTa::Forms source for reference. Static methods and enumerations are available only through the module, not through an object instance.

Wrapped C++ API

The C++ API being wrapped follows. For an API reference of the original C++ API, see http://ufal.mff.cuni.cz/morphodita/api-reference.

  Helper Structures
  -----------------
  
    typedef vector<int> Indices;
  
    typedef vector<string> Forms;
  
    struct TaggedForm {
      string form;
      string tag;
    };
    typedef vector<TaggedForm> TaggedForms;
  
    struct TaggedLemma {
      string lemma;
      string tag;
    };
    typedef vector<TaggedLemma> TaggedLemmas;
    typedef vector<TaggedLemmas> Analyses;
  
    struct TaggedLemmaForms {
      string lemma;
      TaggedForms forms;
    };
    typedef vector<TaggedLemmaForms> TaggedLemmasForms;
  
    struct TokenRange {
      size_t start;
      size_t length;
    };
    typedef vector<TokenRange> TokenRanges;
  
    struct DerivatedLemma {
      std::string lemma;
    };
    typedef vector<DerivatedLemma> DerivatedLemmas;
  
  
  Main Classes
  ------------
  
    class Version {
     public:
      unsigned major;
      unsigned minor;
      unsigned patch;
      string prerelease;
  
      static Version current();
    };
  
    class Tokenizer {
     public:
      virtual void setText(const char* text);
      virtual bool nextSentence(Forms* forms, TokenRanges* tokens);
  
      static Tokenizer* newVerticalTokenizer();
      static Tokenizer* newCzechTokenizer();
      static Tokenizer* newEnglishTokenizer();
      static Tokenizer* newGenericTokenizer();
    };
  
    class TagsetConverter {
     public:
      static TagsetConverter* newIdentityConverter();
      static TagsetConverter* newPdtToConll2009Converter();
      static TagsetConverter* newStripLemmaCommentConverter(const Morpho& morpho);
      static TagsetConverter* newStripLemmaIdConverter(const Morpho& morpho);
  
      virtual void convert(TaggedLemma& lemma) const;
      virtual void convertAnalyzed(TaggedLemmas& lemmas) const;
      virtual void convertGenerated(TaggedLemmasForms& forms) const;
    };
  
    class Derivator {
     public:
      virtual bool parent(const char* lemma, DerivatedLemma& parent) const;
      virtual bool children(const char* lemma, DerivatedLemmas& children) const;
    };
  
    class DerivationFormatter {
     public:
      virtual string formatDerivation(const char* lemma) const;
      virtual void formatTaggedLemma(TaggedLemma& tagged_lemma, const TagsetConverter* converter = nullptr) const;
      virtual void formatTaggedLemmas(TaggedLemmas& tagged_lemma, const TagsetConverter* converter = nullptr) const;
  
      static DerivationFormatter* newNoneDerivationFormatter();
      static DerivationFormatter* newRootDerivationFormatter(const Derivator* derivator);
      static DerivationFormatter* newPathDerivationFormatter(const Derivator* derivator);
      static DerivationFormatter* newTreeDerivationFormatter(const Derivator* derivator);
      static DerivationFormatter* newDerivationFormatter(const char* name, const Derivator* derivator);
    };
  
    class Morpho {
     public:
      static Morpho* load(const char* fname);
  
      enum { NO_GUESSER = 0, GUESSER = 1, GUESSER_UNSPECIFIED = -1 };
  
      virtual int analyze(const char* form, int guesser, TaggedLemmas& lemmas) const;
      virtual int generate(const char* lemma, const char* tag_wildcard, int guesser, TaggedLemmasForms& forms) const;
      virtual string rawLemma(const char* lemma) const;
      virtual string lemmaId(const char* lemma) const;
      virtual string rawForm(const char* form) const;
  
      virtual Tokenizer* newTokenizer() const;
  
      virtual Derivator* getDerivator() const;
    };
  
    class Tagger {
     public:
      static Tagger* load(const char* fname);
  
      virtual const Morpho* getMorpho() const;
  
      virtual void tag(const Forms& forms, TaggedLemmas& tags, int guesser = Morpho::GUESSER_UNSPECIFIED) const;
  
      virtual void tagAnalyzed(const Forms& forms, const Analyses& analyses, Indices& tags) const;
  
      Tokenizer* newTokenizer() const;
    };

Examples

run_morpho_cli

Simple example performing morphological analysis and generation.

  use warnings;
  use strict;
  use open qw(:std :utf8);
  
  use Ufal::MorphoDiTa;
  
  # The dictionary file is the only mandatory argument.
  die "Usage: $0 dict_file\n" unless @ARGV >= 1;
  
  print STDERR "Loading dictionary: ";
  my $morpho = Ufal::MorphoDiTa::Morpho::load($ARGV[0])
    or die "Cannot load dictionary from file '$ARGV[0]'\n";
  print STDERR "done\n";
  # Drop the dictionary name so that <> reads only the remaining inputs.
  shift @ARGV;
  
  my $guesser_mode = $Ufal::MorphoDiTa::Morpho::GUESSER;
  
  # Result vectors that are passed to every analyze/generate call.
  my $lemmas = Ufal::MorphoDiTa::TaggedLemmas->new();
  my $lemmas_forms = Ufal::MorphoDiTa::TaggedLemmasForms->new();
  
  # Each input line holds tab-separated columns: one column is a form to
  # analyze, two columns are a lemma and a tag wildcard to generate from.
  # Lines with any other number of columns produce no output.
  while (my $line = <>) {
    chomp $line;
    my @columns = split /\t/, $line, -1;
  
    if (@columns == 1) {
      # Analyze
      my ($form) = @columns;
      my $used = $morpho->analyze($form, $guesser_mode, $lemmas);
      # Mark the output when the call reports the GUESSER value.
      my $prefix = $used == $guesser_mode ? "Guesser " : "";
  
      for my $i (0 .. $lemmas->size() - 1) {
        my $lemma = $lemmas->get($i);
        printf "%sLemma: %s %s\n", $prefix, $lemma->{lemma}, $lemma->{tag};
      }
    } elsif (@columns == 2) {
      # Generate
      my ($lemma, $tag_wildcard) = @columns;
      my $used = $morpho->generate($lemma, $tag_wildcard, $guesser_mode, $lemmas_forms);
      my $prefix = $used == $guesser_mode ? "Guesser " : "";
  
      for my $i (0 .. $lemmas_forms->size() - 1) {
        my $entry = $lemmas_forms->get($i);
        printf "%sLemma: %s\n", $prefix, $entry->{lemma};
  
        # List every generated form of this lemma with its tag.
        for my $j (0 .. $entry->{forms}->size() - 1) {
          my $form = $entry->{forms}->get($j);
          printf "  %s %s\n", $form->{form}, $form->{tag};
        }
      }
    }
  }

run_tagger

Simple example performing tokenization and PoS tagging.

  use warnings;
  use strict;
  use open qw(:std :utf8);
  
  use Ufal::MorphoDiTa;
  
  # Return a copy of $text in which the characters &, <, > and " are
  # replaced by the entities &amp;, &lt;, &gt; and &quot;.
  # The caller's argument is not modified.
  sub encode_entities($) {
    my ($text) = @_;
    # A capture group with a lookup table is used instead of $&, because
    # the mere presence of $& slows down every regex match in the program
    # on perls older than 5.20.
    my %entity_for = (
      '&' => '&amp;',
      '<' => '&lt;',
      '>' => '&gt;',
      '"' => '&quot;',
    );
    $text =~ s/([&<>"])/$entity_for{$1}/g;
    return $text;
  }
  
  # The tagger model file is the only mandatory argument.
  die "Usage: $0 tagger_file\n" unless @ARGV >= 1;
  
  print STDERR "Loading tagger: ";
  my $tagger = Ufal::MorphoDiTa::Tagger::load($ARGV[0])
    or die "Cannot load tagger from file '$ARGV[0]'\n";
  print STDERR "done\n";
  # Drop the model name so that <> reads only the remaining inputs.
  shift @ARGV;
  
  # Vectors that are refilled for every sentence.
  my $forms = Ufal::MorphoDiTa::Forms->new();
  my $lemmas = Ufal::MorphoDiTa::TaggedLemmas->new();
  my $tokens = Ufal::MorphoDiTa::TokenRanges->new();
  
  my $tokenizer = $tagger->newTokenizer();
  die "No tokenizer is defined for the supplied model!" unless $tokenizer;
  
  my $more_input = 1;
  while ($more_input) {
    # Read block: collect lines up to and including the first empty line,
    # or up to the end of input.
    my $text = '';
    for (;;) {
      my $line = <>;
      $more_input = defined $line;
      last if !$more_input;
      $text .= $line;
      chomp $line;
      last if $line eq '';
    }
  
    # Tag
    $tokenizer->setText($text);
    # Offset in $text up to which the input has already been printed.
    my $t = 0;
    while ($tokenizer->nextSentence($forms, $tokens)) {
      $tagger->tag($forms, $lemmas);
  
      my $last = $lemmas->size() - 1;
      for my $i (0 .. $last) {
        my $lemma = $lemmas->get($i);
        my $token = $tokens->get($i);
        my $start = $token->{start};
        my $length = $token->{length};
  
        # Text between the previous token and this one is echoed escaped,
        # followed by the token itself wrapped in a <token> element.
        my $gap = substr $text, $t, $start - $t;
        my $word = substr $text, $start, $length;
        my $open = $i == 0 ? "<sentence>" : "";
        my $close = $i == $last ? "</sentence>" : "";
  
        printf "%s%s<token lemma=\"%s\" tag=\"%s\">%s</token>%s",
          encode_entities($gap),
          $open,
          encode_entities($lemma->{lemma}),
          encode_entities($lemma->{tag}),
          encode_entities($word),
          $close;
        $t = $start + $length;
      }
    }
    # Echo whatever follows the last token of the block.
    print encode_entities(substr $text, $t);
  }

AUTHORS

Milan Straka <straka@ufal.mff.cuni.cz>

Jana Straková <strakova@ufal.mff.cuni.cz>

COPYRIGHT AND LICENCE

Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of Mathematics and Physics, Charles University in Prague, Czech Republic.

This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0. If a copy of the MPL was not distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/.