ufal.morphodita
===============

The ``ufal.morphodita`` package is a Python binding to the MorphoDiTa library <http://ufal.mff.cuni.cz/morphodita>.

The bindings are a straightforward conversion of the ``C++`` bindings API.
In Python 2, strings can be both ``unicode`` and UTF-8 encoded ``str``, and the
library always produces ``unicode``. In Python 3, strings must be only ``str``.


Wrapped C++ API
---------------

The C++ API being wrapped follows. For an API reference of the original
C++ API, see <http://ufal.mff.cuni.cz/morphodita/api-reference>.

::

  Helper Structures
  -----------------
  
    // List of word forms. In the run_tagger example it is filled by
    // Tokenizer::nextSentence and then passed to Tagger::tag.
    typedef vector<string> Forms;
    
    // A word form together with its tag.
    struct TaggedForm {
      string form;
      string tag;
    };
    typedef vector<TaggedForm> TaggedForms;
    
    // A lemma together with its tag. Morpho::analyze and Tagger::tag store
    // their results as a vector of these.
    struct TaggedLemma {
      string lemma;
      string tag;
    };
    typedef vector<TaggedLemma> TaggedLemmas;
    
    // A lemma together with a list of tagged forms. Morpho::generate stores
    // its results as a vector of these.
    struct TaggedLemmaForms {
      string lemma;
      TaggedForms forms;
    };
    typedef vector<TaggedLemmaForms> TaggedLemmasForms;
    
    // Position of a token inside the text given to Tokenizer::setText.
    // The run_tagger example extracts a token as
    // text[start : start + length].
    // NOTE(review): the unit (bytes vs. characters) for C++ callers is not
    // stated here -- confirm against the API reference.
    struct TokenRange {
      size_t start;
      size_t length;
    };
    typedef vector<TokenRange> TokenRanges;
  
  
  Main Classes
  ------------
  
    // Version number split into three unsigned components.
    class Version {
     public:
      unsigned major;
      unsigned minor;
      unsigned patch;
    
      // Presumably returns the version of the library in use -- confirm
      // against the API reference.
      static Version current();
    };
    
    // Splits a text into sentences and tokens. Usage shown in the run_tagger
    // example: call setText once, then call nextSentence repeatedly.
    class Tokenizer {
     public:
      // Sets the text that subsequent nextSentence calls will process.
      virtual void setText(const char* text);
      // Fills `forms` and `tokens` with the next sentence. The run_tagger
      // example loops while the return value is true and indexes `tokens`
      // in parallel with the tagger output.
      virtual bool nextSentence(Forms* forms, TokenRanges* tokens);
    
      // Static factories for the individual tokenizer kinds.
      // NOTE(review): the exact behavior of each kind is not described in
      // this document; see the API reference.
      static Tokenizer* newVerticalTokenizer();
      static Tokenizer* newCzechTokenizer();
      static Tokenizer* newEnglishTokenizer();
      static Tokenizer* newGenericTokenizer();
    };
    
    // Morphological dictionary offering analysis and generation.
    class Morpho {
     public:
      // Loads a dictionary from the given file. The run_morpho_cli example
      // treats a false result as a loading failure.
      static Morpho* load(const char* fname);
    
      // Values for the `guesser` argument of analyze and generate. The
      // run_morpho_cli example also compares the return value of both
      // methods with GUESSER.
      enum { NO_GUESSER = 0, GUESSER = 1 };
    
      // Analyzes `form` and stores the resulting tagged lemmas in `lemmas`.
      virtual int analyze(const char* form, int guesser, TaggedLemmas& lemmas) const;
      // Generates forms of `lemma` restricted by `tag_wildcard` and stores
      // them, grouped by lemma, in `forms`.
      virtual int generate(const char* lemma, const char* tag_wildcard, int guesser, TaggedLemmasForms& forms) const;
      // NOTE(review): the semantics of rawLemma, lemmaId and rawForm are not
      // described in this document; see the API reference.
      virtual string rawLemma(const char* lemma) const;
      virtual string lemmaId(const char* lemma) const;
      virtual string rawForm(const char* form) const;
    
      // Creates a new tokenizer.
      // NOTE(review): presumably one suited to this dictionary -- confirm.
      virtual Tokenizer* newTokenizer() const;
    };
    
    // Tagger assigning a tagged lemma to every form of a sentence.
    class Tagger {
     public:
      // Loads a tagger model from the given file. The run_tagger example
      // treats a false result as a loading failure.
      static Tagger* load(const char* fname);
    
      // Returns the morphological dictionary associated with this tagger.
      virtual const Morpho* getMorpho() const;
    
      // Tags the sentence in `forms` and stores the result in `tags`. The
      // run_tagger example reads `tags` index by index in parallel with the
      // token ranges of the same sentence.
      virtual void tag(Forms& forms, TaggedLemmas& tags) const;
    
      // Creates a new tokenizer. The run_tagger example checks the result
      // for None and reports that no tokenizer is defined for the model.
      Tokenizer* newTokenizer() const;
    };
    
    // Converts tagged lemmas to another representation.
    // NOTE(review): the individual conversions are not described in this
    // document; see the API reference.
    class TagsetConverter {
     public:
      // Static factories for the individual converter kinds. The last two
      // take the Morpho instance they operate with.
      static TagsetConverter* newIdentityConverter();
      static TagsetConverter* newPdtToConll2009Converter();
      static TagsetConverter* newStripLemmaCommentConverter(const Morpho& morpho);
      static TagsetConverter* newStripLemmaIdConverter(const Morpho& morpho);
    
      // All three methods take a mutable reference and return void, so the
      // conversion is presumably performed in place -- confirm.
      virtual void convert(TaggedLemma& lemma) const;
      virtual void convertAnalyzed(TaggedLemmas& lemmas) const;
      virtual void convertGenerated(TaggedLemmasForms& forms) const;
    };


Examples
========

run_morpho_cli
--------------

Simple example performing morphological analysis and generation::

  
  # Usage: run_morpho_cli dict_file
  #
  # Reads lines from standard input and writes results to standard output:
  #   - a line with one column is a form to analyze;
  #   - a line with two tab-separated columns is a lemma and a tag wildcard
  #     for which forms are generated.
  # Lines with any other number of columns are silently ignored.
  import sys

  from ufal.morphodita import *

  # In Python2, wrap sys.stdin and sys.stdout to work with unicode.
  if sys.version_info[0] < 3:
    import codecs
    import locale
    encoding = locale.getpreferredencoding()
    sys.stdin = codecs.getreader(encoding)(sys.stdin)
    sys.stdout = codecs.getwriter(encoding)(sys.stdout)

  if len(sys.argv) < 2:
    sys.stderr.write('Usage: %s dict_file\n' % sys.argv[0])
    sys.exit(1)

  sys.stderr.write('Loading dictionary: ')
  morpho = Morpho.load(sys.argv[1])
  if not morpho:
    sys.stderr.write("Cannot load dictionary from file '%s'\n" % sys.argv[1])
    sys.exit(1)
  sys.stderr.write('done\n')

  # Output containers, reused for every input line.
  lemmas = TaggedLemmas()
  lemmas_forms = TaggedLemmasForms()
  line = sys.stdin.readline()
  while line:
    tokens = line.rstrip('\r\n').split('\t')
    if len(tokens) == 1: # analyze
      result = morpho.analyze(tokens[0], morpho.GUESSER, lemmas)

      # Prefix the output when the result equals the GUESSER constant.
      guesser = "Guesser " if result == morpho.GUESSER else ""
      for lemma in lemmas:
        sys.stdout.write('%sLemma: %s %s\n' % (guesser, lemma.lemma, lemma.tag))
    elif len(tokens) == 2: # generate
      result = morpho.generate(tokens[0], tokens[1], morpho.GUESSER, lemmas_forms)

      guesser = "Guesser " if result == morpho.GUESSER else ""
      for lemma_forms in lemmas_forms:
        sys.stdout.write('%sLemma: %s\n' % (guesser, lemma_forms.lemma))
        for form in lemma_forms.forms:
          sys.stdout.write('  %s %s\n' % (form.form, form.tag))

    line = sys.stdin.readline()

run_tagger
----------

Simple example performing tokenization and PoS tagging::

  
  # Usage: run_tagger tagger_file
  #
  # Reads standard input in blocks terminated by an empty line (or by end of
  # input), tokenizes and tags each block, and writes the block to standard
  # output with every token wrapped in <token lemma="..." tag="..."> and every
  # sentence wrapped in <sentence>. Text between tokens is copied through
  # with XML entities escaped.
  import sys

  from ufal.morphodita import *

  def encode_entities(text):
    """Escape the characters &, <, > and " as XML entities."""
    return text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;').replace('"', '&quot;')

  # In Python2, wrap sys.stdin and sys.stdout to work with unicode.
  if sys.version_info[0] < 3:
    import codecs
    import locale
    encoding = locale.getpreferredencoding()
    sys.stdin = codecs.getreader(encoding)(sys.stdin)
    sys.stdout = codecs.getwriter(encoding)(sys.stdout)

  if len(sys.argv) == 1:
    sys.stderr.write('Usage: %s tagger_file\n' % sys.argv[0])
    sys.exit(1)

  sys.stderr.write('Loading tagger: ')
  tagger = Tagger.load(sys.argv[1])
  if not tagger:
    sys.stderr.write("Cannot load tagger from file '%s'\n" % sys.argv[1])
    sys.exit(1)
  sys.stderr.write('done\n')

  # Output containers, reused for every sentence.
  forms = Forms()
  lemmas = TaggedLemmas()
  tokens = TokenRanges()
  tokenizer = tagger.newTokenizer()
  if tokenizer is None:
    sys.stderr.write("No tokenizer is defined for the supplied model!\n")
    sys.exit(1)

  not_eof = True
  while not_eof:
    text = ''

    # Read block: accumulate lines up to and including the first empty line.
    while True:
      line = sys.stdin.readline()
      not_eof = bool(line)
      if not not_eof: break
      line = line.rstrip('\r\n')
      text += line
      text += '\n'
      if not line: break

    # Tag
    tokenizer.setText(text)
    t = 0  # offset in text just past the last token already written
    while tokenizer.nextSentence(forms, tokens):
      tagger.tag(forms, lemmas)

      for i in range(len(lemmas)):
        lemma = lemmas[i]
        token = tokens[i]
        sys.stdout.write('%s%s<token lemma="%s" tag="%s">%s</token>%s' % (
          encode_entities(text[t : token.start]),
          "<sentence>" if i == 0 else "",
          encode_entities(lemma.lemma),
          encode_entities(lemma.tag),
          encode_entities(text[token.start : token.start + token.length]),
          "</sentence>" if i + 1 == len(lemmas) else "",
        ))
        t = token.start + token.length
    # Copy through whatever follows the last token of the block.
    sys.stdout.write(encode_entities(text[t : ]))


AUTHORS
=======

Milan Straka <straka@ufal.mff.cuni.cz>

Jana Straková <strakova@ufal.mff.cuni.cz>


COPYRIGHT AND LICENCE
=====================

Copyright 2014 by Institute of Formal and Applied Linguistics, Faculty of
Mathematics and Physics, Charles University in Prague, Czech Republic.

MorphoDiTa is free software: you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as
published by the Free Software Foundation, either version 3 of
the License, or (at your option) any later version.

MorphoDiTa is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public License
along with MorphoDiTa.  If not, see <http://www.gnu.org/licenses/>.
