#!/usr/bin/env ruby
# coding: utf-8

require "rexml/document"
require "libxml"
include LibXML

require File.join(File.dirname(__FILE__), "paula_consts")
include PaulaConsts



# Candidate input directories: the current directory plus every entry of it
# whose direct children include a file named ann_morphosyntax.xml.
names = ["."].concat(Dir["*"]).select do |dir|
  marker = File.join(dir, "ann_morphosyntax.xml")
  Dir[File.join(dir, "*")].include?(marker)
end

# Presentation (annotation) levels; each value indexes the per-file
# annolevels array built in process_file.

# 0 - PAULA bookkeeping files
ANNOL_PAULA = 0
# 1 - token level (segmentation + lemmas + morphosyntax + senses)
ANNOL_TOK = 1
# 2 - structures: syntactic words and groups
ANNOL_STRUCT = 2
# 3 - named entities
ANNOL_NAMED = 3
# 4 - text
ANNOL_TEXT = 4

# Number of annotation levels.
ANNOLS = 5


#$stderr.puts ((["."]+Dir["*"])-names).inspect

OUTPUT_ONESEGMENT_WORDS = true

# Attribute name => list of values that identify that attribute inside a
# colon-separated tag (see msd_decode).
@attrs = {
  number: %w[sg pl],
  case: %w[nom gen dat acc inst loc voc],
  gender: %w[m1 m2 m3 f n],
  person: %w[pri sec ter],
  degree: %w[pos com sup],
  tense: %w[pres past fut],
  mood: %w[ind imp cond],
  reflexivity: %w[refl nrefl],
  aspect: %w[imperf perf],
  negation: %w[aff neg],
  accommodability: %w[congr rec],
  accentability: %w[akc nakc],
  post_prepositionality: %w[npraep praep],
  agglutination: %w[agl nagl],
  vocalicity: %w[nwok wok],
  fullstoppedness: %w[pun npun],
  cont: %w[discr ndiscr],
  brev_pos: %w[NOUN ADJ ADV QUB PREP CONJ VERB PPAS PACT XXX NG PrepNG AdjG DisG],
}

# Project-specific extensions of the core String class: snake-casing,
# XML escaping and UTF-8 -> ISO-8859-2 transliteration.
class String
  # Kept only for backward compatibility. to_isolatin2 no longer uses this
  # shared converter: Encoding::Converter#convert offers no way to recover
  # after a conversion error, so one failure could affect later calls.
  @@ec_utf_to_isolatin2 = Encoding::Converter.new('utf-8', 'ISO-8859-2')

  # Replacements applied character by character before converting to
  # ISO-8859-2. The original table listed "−" and "ò" twice; the later
  # "ò" => "é" entry silently overrode "ò" => "o". The duplicates are
  # removed and the plain-letter mapping is kept.
  UTFLATINS = {
    "”" => '"',
    "„" => '"',
    "–" => "-",
    "…" => "...",
    "—" => "-",
    "à" => "a",
    "’" => "'",
    "«" => "<<",
    "»" => ">>",
    "·" => "-",
    "−" => "-",
    "“" => '"',
    "ê" => "e",
    "á" => "a",
    "ō" => "o",
    "²" => "2",
    "ò" => "o",
    "ë" => "e",
    "•" => "*",
    "è" => "e",
    "Ñ" => "N",
    "ô" => "o",
    "ñ" => "n",
    "é" => "e",
    "‘" => "'",
    "û" => "u",
  }

  # The five predefined XML entities.
  XMLENTITIES = {
    "&" => "&amp;",
    "'" => "&apos;",
    '"' => "&quot;",
    "<" => "&lt;",
    ">" => "&gt;",
  }

  # In-place variant of snake_case. Always returns self; the previous
  # gsub!/downcase! chain raised NoMethodError whenever gsub! found nothing
  # to replace (gsub! returns nil in that case).
  def snake_case!
    replace(snake_case)
  end

  # Returns a new string in which every upper-case ASCII letter that follows
  # another character is prefixed with "_", then everything is lower-cased.
  def snake_case
    gsub(/(.)([A-Z])/, '\1_\2').downcase
  end

  # Returns a copy with & ' " < > replaced by their XML entities.
  def xml_escape
    gsub(/[&'"<>]/, XMLENTITIES)
  end

  # Returns the string transliterated through UTFLATINS and encoded as
  # ISO-8859-2. When the conversion fails, a message is printed to $stderr
  # and the receiver itself is returned unchanged.
  def to_isolatin2
    transliterated = chars.map { |ch| UTFLATINS.fetch(ch, ch) }.join
    transliterated.encode('ISO-8859-2', 'UTF-8')
  rescue EncodingError
    $stderr.puts self+": nie da sie skonwertowac"
    self
  end
end

# Reverse index of @attrs: attribute value => attribute name.
@attrnamemap = @attrs.each_with_object({}) do |(name, values), index|
  values.each { |value| index[value] = name }
end

# Decodes a colon-separated tag into per-attribute values using @attrnamemap.
#
# tag       - String of values separated by ":".
# full_hash - when true, return a Hash (attribute name => value) in which
#             every attribute of @attrs is present; otherwise return an Array
#             of values in @attrs key order.
#
# Attributes absent from the tag are reported as "-". A value unknown to
# @attrnamemap fails the assertion.
def msd_decode tag, full_hash = false
  decoded = tag.split(":").each_with_object({}) do |value, acc|
    acc[@attrnamemap[value]] = value
  end
  # Unknown values end up under the nil key.
  assert decoded[nil].nil?, decoded[nil]

  return @attrs.keys.map { |name| decoded[name] || "-" } unless full_hash

  @attrs.keys.each { |name| decoded[name] ||= "-" }
  decoded
end

# Aborts when +cond+ is false or nil: prints +msg+ to $stderr and raises a
# RuntimeError without a message. Returns nil otherwise.
def assert cond, msg
  unless cond
    $stderr.puts msg
    raise
  end
end

# Conditional warning: warn(cond, msg) prints +msg+ to $stderr unless +cond+
# is truthy.
#
# A top-level definition overrides Kernel#warn for every object, so any
# ordinary one-argument call (warn "message") used to fail with
# ArgumentError. Calls that are not exactly two positional arguments are
# therefore passed on to Kernel#warn.
def warn *args, **opts
  return super unless args.length == 2 && opts.empty?

  cond, msg = args
  return if cond
  $stderr.puts msg
end

# Writes the common preamble of a PAULA file to +f+: XML declaration,
# DOCTYPE pointing at paula_<dtd>.dtd, the opening <paula> tag and the
# <header> element.
#
# f       - IO-like object responding to puts.
# dtd     - DTD name used in the DOCTYPE.
# file_id - document identifier; combined with +id+ into paula_id.
# id      - suffix of the paula_id.
# hdrtype - optional value of the header's type attribute.
def out_header f, dtd, file_id, id, hdrtype = nil
  # The leading space matters: XML requires whitespace between attributes.
  type_attr = hdrtype ? " type=\"#{hdrtype}\"" : ""
  f.puts "<?xml version=\"1.0\" standalone=\"no\"?>"
  f.puts "<!DOCTYPE paula SYSTEM \"paula_#{dtd}.dtd\">"
  f.puts
  f.puts "<paula version=\"1.0\">"
  f.puts "<header paula_id=\"#{file_id}_#{id}\"#{type_attr}/>"
end


# Writes one PAULA feature file per key in +keys+ and registers it.
#
# prefix     - directory the files are written to.
# file_id    - document identifier used in file names and headers.
# filenames  - Hash updated with :"<base>_<key>file" => file name.
# annolevels - Array of per-level lists; [file, "<base>:<key>"] is appended
#              to annolevels[annolevel].
# base       - name of the annotated layer (used in file names and labels).
# keys       - feature keys to export; each is looked up in every hash.
# hashes     - annotated items; each must provide :id and may provide :orth.
# basefile   - value of the xml:base attribute.
# annolevel  - index into +annolevels+.
#
# Values are made safe for an XML attribute. Ampersands that already start an
# entity or character reference are left alone because some values may
# arrive entity-encoded; quotes, "<" and bare "&" are escaped.
def write_feat prefix, file_id, filenames, annolevels, base, keys, hashes, basefile, annolevel
  escape_attr = lambda do |raw|
    raw.to_s
       .gsub(/&(?!(?:[A-Za-z][A-Za-z0-9]*|#[0-9]+|#x[0-9A-Fa-f]+);)/, "&amp;")
       .gsub('"', "&quot;")
       .gsub("<", "&lt;")
  end
  # "--" is not allowed inside an XML comment, so runs of hyphens are split.
  comment_safe = lambda { |raw| raw.to_s.gsub(/-(?=-)/, "- ") }

  keys.each do |what|
    file = "#{file_id}.#{base}_#{what}.xml"

    filenames["#{base}_#{what}file".to_sym] = file
    annolevels[annolevel] << [file, "#{base}:#{what}"]

    File.open(File.join(prefix, file), "w") do |f|
      out_header f, "feat", file_id, "#{base}_#{what}"
      f.puts
      f.puts "<featList xmlns:xlink=\"http://www.w3.org/1999/xlink\" type=\"#{what}\" xml:base=\"#{basefile}\">"
      hashes.each do |h|
        f.puts "<feat xlink:href=\"##{h[:id]}\" value=\"#{escape_attr.call(h[what])}\"/><!-- #{comment_safe.call(h[:orth])} -->"
      end
      f.puts "</featList>"
      f.puts "</paula>"
    end
  end
end

# Writes the two PAULA bookkeeping files for a document:
# <file_id>.anno_feat.xml (a label for every registered file) and
# <file_id>.anno.xml (the annotation set grouping files by level).
#
# prefix     - output directory.
# file_id    - document identifier.
# filenames  - Hash; receives :annofile and :annofeatfile.
# annolevels - Array of per-level lists of [file, label]; the anno_feat file
#              is registered under ANNOL_PAULA and every entry gets its
#              running index appended as a third element.
def do_annoset prefix, file_id, filenames, annolevels
  set_name = "#{file_id}.anno.xml"
  feat_name = "#{file_id}.anno_feat.xml"

  filenames[:annofile] = set_name
  annolevels[ANNOL_PAULA] << [feat_name, "annoFeat"]
  filenames[:annofeatfile] = feat_name

  File.open(File.join(prefix, feat_name), "w") do |out|
    out_header out, "feat", file_id, "annoFeat"
    out.puts
    out.puts "<featList xmlns:xlink=\"http://www.w3.org/1999/xlink\" type=\"annoFeat\" xml:base=\"#{filenames[:annofile]}\">"
    # flatten(1) keeps the entry arrays themselves, so appending the index
    # below is visible through annolevels when the set file is written.
    annolevels.flatten(1).each_with_index do |entry, index|
      out.puts "<feat xlink:href=\"#rel_#{index}\" value=\"#{entry[1]}\"/>"
      entry << index
    end
    out.puts "</featList>"
    out.puts "</paula>"
  end

  File.open(File.join(prefix, set_name), "w") do |out|
    out_header out, "struct", file_id, "annoSet"
    out.puts
    out.puts '<structList xmlns:xlink="http://www.w3.org/1999/xlink" type="annoSet">'
    annolevels.each_with_index do |level, level_no|
      unless level.empty?
        out.puts "<struct id=\"anno_#{level_no}\">"
        level.each do |(target, _label, rel_no)|
          out.puts "  <rel id=\"rel_#{rel_no}\" xlink:href=\"#{target}\"/>"
        end
        out.puts "</struct>\n\n"
      end
    end
    out.puts "</structList>"
    out.puts "</paula>"
  end
end


# Reads text.xml from +prefix+, concatenates the content of every <ab> block
# into one text and writes it as <file_id>.text.xml.
#
# Each block contributes its inner XML followed by "\n"; each <div>
# contributes one more "\n"; the final character of the result is dropped.
#
# filenames  - Hash; receives :textfile.
# annolevels - the text file is registered under ANNOL_TEXT.
#
# Returns a Hash mapping <ab> id => {:id, :n, :text, :start, :dividx}, where
# :start is the block's offset in the concatenated text.
def do_textlevel prefix, file_id, filenames, annolevels
  doc = XML::Parser.file(File.join(prefix, "text.xml")).parse
  doc.root.namespaces.default_prefix="tei"

  fulltext = ""
  divs = []
  abs = {}
  doc.find("tei:TEI/tei:text/tei:body/tei:div").each do |div|
    div_blocks = []
    divs << {:id => div.attributes['id'], :decls => div.attributes['decls'], :abs => div_blocks}

    div.find("tei:ab").each do |ab|
      content = ab.inner_xml
      block = {
        :id => ab.attributes['id'],
        :n => ab.attributes['n'],
        :text => content,
        :start => fulltext.length,
        :dividx => divs.length-1,
      }
      abs[block[:id]] = block
      div_blocks << block
      fulltext += content + "\n"
    end
    fulltext += "\n"
  end

  # Drop the last character (the newline added after the final div).
  fulltext = fulltext[0...-1]

  textfile = "#{file_id}.text.xml"
  File.open(File.join(prefix, textfile), "w") do |out|
    out_header out, "text", file_id, "text", "text"
    out.puts "<body>"+fulltext+"</body>"
    out.puts "</paula>"
  end

  filenames[:textfile] = textfile
  annolevels[ANNOL_TEXT] << [textfile, "text"]

  abs
end

# Reads ann_segmentation.xml from +prefix+ and writes three PAULA markable
# files: <file_id>.tok.xml, <file_id>.sent.xml and <file_id>.para.xml.
#
# abs        - Hash returned by do_textlevel (<ab> id => block data); used to
#              turn block-relative offsets into offsets in the full text.
# filenames  - Hash; :textfile is read, :tokfile/:sentfile/:parafile are set.
# annolevels - the three files are registered under ANNOL_TOK.
#
# Segments whose "rejected" attribute is "true" are skipped.
#
# Returns [pars, segtion_pars, segtion_sents, segtion_segs]: the ordered
# paragraph list plus id-indexed hashes of paragraphs, sentences and tokens.
def do_segmentation prefix, file_id, abs, filenames, annolevels
  doc = XML::Parser.file(File.join(prefix, "ann_segmentation.xml")).parse
  doc.root.namespaces.default_prefix="tei"

  # "--" is not allowed inside an XML comment, so runs of hyphens are split.
  comment_safe = lambda { |text| text.to_s.gsub(/-(?=-)/, "- ") }

  # href covering a run of marks: a plain fragment for one item, an
  # id()/range-to() xpointer otherwise.
  span_href = lambda do |items|
    first_id = items.first[:id]
    if items.length == 1
      "##{first_id}"
    else
      "#xpointer(id('#{first_id}')/range-to(id('#{items.last[:id]}')))"
    end
  end

  # Comment preview built from the first three tokens.
  preview = lambda do |toks|
    comment_safe.call(toks[0..2].map{|t| (t[:nospace] ? "" : " ")+t[:orth]}.join)
  end

  segtion_pars = {}
  segtion_sents = {}
  segtion_segs = {}
  pars = []
  doc.find("tei:TEI/tei:text/tei:body/tei:p").each do |par|
    par_id = par.attributes['id']
    parsents = []
    _par = {:seg_id => par_id, :sents => parsents}
    pars << _par
    segtion_pars[par_id] = _par

    par.find("tei:s").each do |sent|
      sent_id = sent.attributes['id']
      senttoks = []
      _sent = {:seg_id => sent_id, :toks => senttoks}
      parsents << _sent
      segtion_sents[sent_id] = _sent

      sent.find("tei:seg | tei:choice/tei:seg | tei:choice/nkjp:paren/tei:seg").each do |seg|
        next if seg.attributes['rejected'] == "true"
        seg_id = seg.attributes['id']
        nospace = seg.attributes['nps'] == "true"
        # corresp is read as "...(<block id>,<start>,<length>)".
        cor = seg.attributes['corresp']
        p1 = cor.index("(")+1
        p2 = cor.index(",", p1)
        p3 = cor.index(",", p2+1)
        cid = cor[p1..p2-1]
        cst = cor[p2+1..p3-1].to_i
        len = cor[p3+1..-2].to_i
        block = abs[cid]
        tok = {:seg_id => seg_id, :start => cst + block[:start], :len => len, :nospace => nospace, :orth => block[:text][cst, len]}
        segtion_segs[seg_id] = tok
        senttoks << tok
      end
    end
  end

  # Write the successive markable files. xml:base values are quoted; they
  # used to be emitted bare, which is not well-formed XML.

  tokfile = "#{file_id}.tok.xml"
  File.open(File.join(prefix, tokfile), "w") do |f|
    out_header f, "mark", file_id, "tok"
    f.puts "<markList xmlns:xlink=\"http://www.w3.org/1999/xlink\" type=\"tok\" xml:base=\"#{filenames[:textfile]}\">"
    pars.map{|p| p[:sents].map{|s| s[:toks]}}.flatten.each_with_index do |t, tid|
      t[:id] = "tok_#{tid}"
      f.puts "<mark id=\"#{t[:id]}\" xlink:href=\"#xpointer(string-range(//body,'',#{t[:start]},#{t[:len]}))\"/><!-- #{comment_safe.call(t[:orth])} -->"
    end
    f.puts "</markList>"
    f.puts "</paula>"
  end

  sentfile = "#{file_id}.sent.xml"
  File.open(File.join(prefix, sentfile), "w") do |f|
    out_header f, "mark", file_id, "sent"
    f.puts "<markList xmlns:xlink=\"http://www.w3.org/1999/xlink\" type=\"sent\" xml:base=\"#{tokfile}\">"
    pars.map{|p| p[:sents]}.flatten.each_with_index do |s, sid|
      s[:id] = "s_#{sid}"
      f.puts "<mark id=\"#{s[:id]}\" xlink:href=\"#{span_href.call(s[:toks])}\"/><!--#{preview.call(s[:toks])} ... -->"
    end
    f.puts "</markList>"
    f.puts "</paula>"
  end

  parafile = "#{file_id}.para.xml"
  File.open(File.join(prefix, parafile), "w") do |f|
    out_header f, "mark", file_id, "para"
    f.puts "<markList xmlns:xlink=\"http://www.w3.org/1999/xlink\" type=\"para\" xml:base=\"#{sentfile}\">"
    pars.each_with_index do |p, pid|
      p[:id] = "p_#{pid}"
      f.puts "<mark id=\"#{p[:id]}\" xlink:href=\"#{span_href.call(p[:sents])}\"/><!--#{preview.call(p[:sents].map{|s| s[:toks]}.flatten)} ... -->"
    end
    f.puts "</markList>"
    f.puts "</paula>"
  end

  filenames[:tokfile] = tokfile
  filenames[:sentfile] = sentfile
  filenames[:parafile] = parafile

  annolevels[ANNOL_TOK] << [tokfile, "token"]
  annolevels[ANNOL_TOK] << [sentfile, "sentence"]
  annolevels[ANNOL_TOK] << [parafile, "paragraph"]

  return pars, segtion_pars, segtion_sents, segtion_segs
end

# Reads ann_morphosyntax.xml from +prefix+, attaches the chosen
# interpretation (:lemma, :pos, :msd) to every token and writes the
# token-level pos/lemma/msd feature files via write_feat.
#
# pars                - ordered paragraph list from do_segmentation.
# segtion_pars/_sents/_segs - segmentation-id-indexed hashes from
#                       do_segmentation; entries are tagged with :morph_id.
#
# Returns [morpho_pars, morpho_sents, morpho_segs]: the same paragraph,
# sentence and token hashes re-indexed by their morphosyntax ids.
def do_morphosyntax prefix, file_id, filenames, annolevels, pars, segtion_pars, segtion_sents, segtion_segs
  doc = XML::Parser.file(File.join(prefix, "ann_morphosyntax.xml")).parse
  doc.root.namespaces.default_prefix="tei"

  # Part of a corresp value that follows its first "#".
  target_of = lambda { |corresp| corresp[corresp.index("#")+1..-1] }

  morpho_pars = {}
  morpho_sents = {}
  morpho_segs = {}

  doc.find("tei:TEI/tei:text/tei:body/tei:p").each do |par|
    par_id = par.attributes['id']
    morpho_pars[par_id] = segtion_pars[target_of.call(par.attributes['corresp'])]
    morpho_pars[par_id][:morph_id] = par_id

    par.find("tei:s").each do |sent|
      sent_id = sent.attributes['id']
      morpho_sents[sent_id] = segtion_sents[target_of.call(sent.attributes['corresp'])]
      morpho_sents[sent_id][:morph_id] = sent_id

      sent.find("tei:seg").each do |seg|
        seg_id = seg.attributes["id"]
        tok = segtion_segs[target_of.call(seg.attributes['corresp'])]
        morpho_segs[seg_id] = tok

        assert !tok.nil?, "#{prefix}, #{seg_id}"

        tok[:morph_id] = seg_id

        morph = seg.find("tei:fs[@type='morph']").first
        # Read but not compared with tok[:orth]; the check is disabled.
        orth = morph.find("tei:f[@name='orth']/tei:string").first.inner_xml

#        assert (orth == tok[:orth]), "#{orth} != #{tok[:orth]}"

        # Locate the interpretation selected by the disamb/choice feature.
        choice = morph.find("tei:f[@name='disamb']/tei:fs/tei:f[@name='choice']").first.attributes['fVal']
        choice = choice[1..-1] if choice.start_with?("#")
        symbol = morph.find("tei:f/tei:fs/tei:f/tei:symbol[@xml:id='#{choice}']").first ||
                 morph.find("tei:f/tei:fs/tei:f/tei:vAlt/tei:symbol[@xml:id='#{choice}']").first

        # Climb from the symbol to the enclosing lexeme; one extra step is
        # needed when the symbol sits below an f[@name='msd'] element.
        lexeme = symbol.parent.parent
        if lexeme.name == "f" && lexeme.attributes["name"] == "msd"
          lexeme = lexeme.parent
        end

        tok[:lemma] = lexeme.find("tei:f[@name='base']/tei:string").first.inner_xml
        tok[:pos] = lexeme.find("tei:f[@name='ctag']/tei:symbol").first.attributes['value']
        tok[:msd] = symbol.attributes["value"]
      end
    end
  end

  # Write the token-level annotation files.
  all_toks = pars.map{|p| p[:sents].map{|s| s[:toks]}}.flatten
  write_feat prefix, file_id, filenames, annolevels, "tok", [:pos, :lemma, :msd], all_toks, filenames[:tokfile], ANNOL_TOK

  return morpho_pars, morpho_sents, morpho_segs
end

# Reads ann_words.xml (syntactic words) from +prefix+, writes
# <file_id>.words.xml plus the word-level pos/lemma/msd feature files.
#
# morpho_pars/_sents/_segs - hashes returned by do_morphosyntax; sentences
#                            receive :wor_id and :words, tokens :level.
# filenames  - Hash; :tokfile is read, :wordsfile is set.
# annolevels - files are registered under ANNOL_STRUCT.
#
# Returns [wor_pars, wor_sents, wor_segs] indexed by word-layer ids, or three
# empty hashes when ann_words.xml does not exist.
#
# Changes from the earlier version: File.exist? replaces File.exists?
# (removed in Ruby 3.2); the level fixpoint loop now continues while ANY
# word changed (the flag used to be overwritten by the last word only); and
# words are ordered by the numeric part of the token id instead of comparing
# ids such as "tok_10" and "tok_9" as strings.
def do_words prefix, file_id, filenames, annolevels, morpho_pars, morpho_sents, morpho_segs
  words_path = File.join(prefix, "ann_words.xml")
  return {}, {}, {} unless File.exist?(words_path)

  # Trailing number of an id such as "tok_12" (0 when there is none).
  id_number = lambda { |ident| ident.to_s[/\d+\z/].to_i }
  # "--" is not allowed inside an XML comment, so runs of hyphens are split.
  comment_safe = lambda { |text| text.to_s.gsub(/-(?=-)/, "- ") }

  wor_pars = {}
  wor_sents = {}
  wor_segs = {}

  doc = XML::Parser.file(words_path).parse

  sts = []

  doc.root.namespaces.default_prefix="tei"

  doc.find("tei:TEI/tei:text/tei:body/tei:p").each do |par|
    id = par.attributes['id']
    cor = par.attributes['corresp']
    sid = cor[cor.index("#")+1..-1]
    wor_pars[id] = morpho_pars[sid]
    wor_pars[id][:wor_id] = id

    par.find("tei:s").each do |sent|
      id = sent.attributes['id']
      cor = sent.attributes['corresp']
      sid = cor[cor.index("#")+1..-1]
      sen = morpho_sents[sid]
      wor_sents[id] = sen
      sen[:wor_id] = id
      sen_wors = []
      sen[:words] = sen_wors
      sts << sen

      sent.find("tei:seg").each do |seg|
        id = seg.attributes["id"]

        wo = seg.find("tei:fs[@type='words']").first
        orth = wo.find("tei:f[@name='orth']/tei:string").first.inner_xml
        base = wo.find("tei:f[@name='base']/tei:string").first.inner_xml
        pos = wo.find("tei:f[@name='ctag']/tei:symbol").first.attributes['value']
        msd = wo.find("tei:f[@name='msd']/tei:symbol").first.attributes['value']
        ptrs = seg.find("tei:ptr").map {|x| x.attributes["target"]}
        # Words that point at nothing are skipped.
        next if ptrs.empty?

        wor = {:orth => orth, :lemma => base, :pos => pos, :msd => msd, :ptrs => ptrs,
          :wor_id => id, :sent => sen}

        wor_segs[id] = wor
        sen_wors << wor
      end
    end
  end

  # Resolve pointers: targets starting with "ann_morphosyntax.xml" refer to
  # tokens (21 = length of "ann_morphosyntax.xml#"), anything else to another
  # word of this layer, with an optional leading "#".
  wor_segs.each_value do |wor|
    wor[:ptrs] = wor[:ptrs].map do |ptr|
      if ptr.start_with? "ann_morphosyntax.xml"
        morpho_segs[ptr[21..-1]]
      else
        wor_segs[ptr[(ptr[0] == "#" ? 1 : 0)..-1]]
      end
    end
    wor[:level] = 1
  end

  # Compute nesting levels bottom-up: tokens are level 0, a word is one above
  # its highest constituent. Repeat until a full pass changes nothing.
  morpho_segs.each_value do |t|
    t[:level] = 0
  end

  changed = true
  while changed
    changed = false
    wor_segs.each_value do |wor|
      new_level = wor[:ptrs].map{|x| x[:level]}.max+1
      next if new_level == wor[:level]
      wor[:level] = new_level
      changed = true
    end
  end

  # Order the words of each sentence by the token they end on, then by level.
  wor_sents.each_value do |sen|
    sen[:words].each do |w|
      lastok = w[:ptrs].last
      # Descend through nested words until a token (no :wor_id) is reached.
      while !lastok[:wor_id].nil?
        lastok = lastok[:ptrs].last
      end
      w[:ends_on] = lastok[:id]
    end
    sen[:words].sort_by! {|w| [id_number.call(w[:ends_on]), w[:level]]}
  end

  # Write .words.xml

  wordsfile = "#{file_id}.words.xml"

  tokfile = filenames[:tokfile]
  relid = 0
  wordid = 0
  allwords = []
  File.open(File.join(prefix, wordsfile), "w") do |f|
    out_header f, "struct", file_id, "words"
    f.puts
    f.puts '<structList xmlns:xlink="http://www.w3.org/1999/xlink" type="words">'

    sts.each do |sen|
      sen[:words].each do |wor|
        wor[:id] = "word_#{wordid}"
        wordid += 1
        allwords << wor
        f.puts "<struct id=\"#{wor[:id]}\"><!-- #{comment_safe.call(wor[:orth])} -->"
        wor[:ptrs].each do |p|
          # Words live in this file, tokens in the token file.
          href = p[:wor_id] ? "##{p[:id]}" : "#{tokfile}##{p[:id]}"
          f.puts "<rel id=\"rel_#{relid}\" xlink:href=\"#{href}\"/>"
          relid += 1
        end
        f.puts "</struct>"
      end
    end

    f.puts "</structList>"
    f.puts "</paula>"
  end

  filenames[:wordsfile] = wordsfile

  annolevels[ANNOL_STRUCT] << [wordsfile, "syntactic words"]

  # Write the word-level annotation files.
  write_feat prefix, file_id, filenames, annolevels, "words", [:pos, :lemma, :msd], allwords, wordsfile, ANNOL_STRUCT

  return wor_pars, wor_sents, wor_segs
end

# Reads ann_groups.xml (syntactic groups) from +prefix+ and writes
# <file_id>.groups.xml, the group "type" feature file and
# <file_id>.groups_func.xml, which labels head relations.
#
# wor_pars/_sents/_segs - hashes returned by do_words; sentences receive
#                         :gr_id and :groups, words :glevel.
# filenames  - Hash; :wordsfile is read, :groupsfile and :groups_funcfile set.
# annolevels - files are registered under ANNOL_STRUCT.
#
# Returns nil immediately when ann_groups.xml does not exist.
#
# Changes from the earlier version: File.exist? replaces File.exists?
# (removed in Ruby 3.2); the level fixpoint loop continues while ANY group
# changed; groups are ordered by the numeric part of the end-token id rather
# than by string comparison.
def do_groups prefix, file_id, filenames, annolevels, wor_pars, wor_sents, wor_segs
  groups_path = File.join(prefix, "ann_groups.xml")
  return unless File.exist?(groups_path)

  # Trailing number of an id such as "tok_12" (0 when there is none).
  id_number = lambda { |ident| ident.to_s[/\d+\z/].to_i }
  # "--" is not allowed inside an XML comment, so runs of hyphens are split.
  comment_safe = lambda { |text| text.to_s.gsub(/-(?=-)/, "- ") }

  gr_pars = {}
  gr_sents = {}
  gr_segs = {}

  doc = XML::Parser.file(groups_path).parse

  doc.root.namespaces.default_prefix="tei"

  sts = []
  doc.find("tei:TEI/tei:text/tei:body/tei:p").each do |par|
    id = par.attributes['id']
    cor = par.attributes['corresp']
    sid = cor[cor.index("#")+1..-1]
    gr_pars[id] = wor_pars[sid]
    gr_pars[id][:gr_id] = id

    par.find("tei:s").each do |sent|
      id = sent.attributes['id']
      cor = sent.attributes['corresp']
      sid = cor[cor.index("#")+1..-1]
      sen = wor_sents[sid]
      gr_sents[id] = sen
      sen[:gr_id] = id
      sen_grs = []
      sen[:groups] = sen_grs
      sts << sen

      sent.find("tei:seg").each do |seg|
        gr = seg.find("tei:fs[@type='group']").first
        orth = gr.find("tei:f[@name='orth']/tei:string").first.inner_xml
        type = gr.find("tei:f[@name='type']/tei:symbol").first.attributes["value"]
        ptrs = seg.find("tei:ptr").map do |x|
          {:pt => x.attributes["target"], :type => x.attributes["type"]}
        end
        # Groups that point at nothing are skipped.
        next if ptrs.empty?
        id = seg.attributes["id"]
        gr_ = {:gr_id => id, :orth => orth, :type => type, :ptrs => ptrs}
        gr_segs[id] = gr_
        sen_grs << gr_
      end
    end
  end

  # Resolve pointers: targets starting with "ann_words.xml" refer to words
  # (14 = length of "ann_words.xml#"), anything else to another group, with
  # an optional leading "#". Every pointer whose type is not "nonhead" is
  # also stored on the group under its type name (e.g. :head).
  gr_segs.each_value do |gr|
    gr[:ptrs] = gr[:ptrs].map do |ptr|
      p = nil
      if ptr[:pt].start_with? "ann_words.xml"
        p = wor_segs[ptr[:pt][14..-1]]
      else
        p = gr_segs[ptr[:pt][(ptr[:pt][0] == "#" ? 1 : 0)..-1]]
      end
      gr[ptr[:type].to_sym] = p unless ptr[:type] == "nonhead"
      p
    end
    gr[:glevel] = 1
  end

  # Compute nesting levels bottom-up: words are level 0, a group is one above
  # its highest constituent. Repeat until a full pass changes nothing.
  wor_segs.each_value do |w|
    w[:glevel] = 0
  end

  changed = true
  while changed
    changed = false
    gr_segs.each_value do |gr|
      new_level = gr[:ptrs].map{|x| x[:glevel]}.max+1
      next if new_level == gr[:glevel]
      gr[:glevel] = new_level
      changed = true
    end
  end

  # Order the groups of each sentence by the token they end on, then by level.
  gr_sents.each_value do |sen|
    sen[:groups].each do |g|
      lastok = g[:ptrs].last
      # Descend until a constituent that already knows its end token.
      while lastok[:ends_on].nil?
        lastok = lastok[:ptrs].last
      end
      g[:ends_on] = lastok[:ends_on]
    end
    sen[:groups].sort_by! {|g| [id_number.call(g[:ends_on]), g[:glevel]]}
  end

  # Write .groups.xml

  groupsfile = "#{file_id}.groups.xml"

  wordsfile = filenames[:wordsfile]
  relid = 0
  grid = 0
  allgrs = []
  allrels = []
  File.open(File.join(prefix, groupsfile), "w") do |f|
    out_header f, "struct", file_id, "groups"
    f.puts
    f.puts '<structList xmlns:xlink="http://www.w3.org/1999/xlink" type="groups">'

    sts.each do |sen|
      sen[:groups].each do |gr|
        gr[:id] = "group_#{grid}"
        grid += 1
        allgrs << gr
        f.puts "<struct id=\"#{gr[:id]}\"><!-- #{comment_safe.call(gr[:orth])} -->"
        gr[:ptrs].each do |p|
          id = "rel_#{relid}"
          relid += 1
          # Remember which relations point at a head of the group.
          case p
          when gr[:head] then allrels << ["head", id]
          when gr[:semh] then allrels << ["semh", id]
          when gr[:synh] then allrels << ["synh", id]
          end
          # Groups live in this file, words in the words file.
          href = p[:gr_id] ? "##{p[:id]}" : "#{wordsfile}##{p[:id]}"
          f.puts "<rel id=\"#{id}\" xlink:href=\"#{href}\"/>"
        end
        f.puts "</struct>"
      end
    end
    f.puts "</structList>"
    f.puts "</paula>"
  end

  filenames[:groupsfile] = groupsfile
  annolevels[ANNOL_STRUCT] << [groupsfile, "syntactic groups"]

  # Write the annotation files for groups and their relations (type, heads).
  write_feat prefix, file_id, filenames, annolevels, "groups", [:type], allgrs, groupsfile, ANNOL_STRUCT

  funcfile = "#{file_id}.groups_func.xml"

  File.open(File.join(prefix, funcfile), "w") do |f|
    out_header f, "feat", file_id, "groups_func"
    f.puts
    f.puts "<featList xmlns:xlink=\"http://www.w3.org/1999/xlink\" type=\"func\" xml:base=\"#{groupsfile}\">"
    allrels.each do |r|
      f.puts "<feat xlink:href=\"##{r[1]}\" value=\"#{r[0]}\"/>"
    end
    f.puts "</featList>"
    f.puts "</paula>"
  end

  filenames[:groups_funcfile] = funcfile

  annolevels[ANNOL_STRUCT] << [funcfile, "group head(s)"]
end

# Reads ann_named.xml (named entities) from +prefix+ and writes
# <file_id>.named.xml plus one feature file per named-entity attribute
# (type, subtype, when, derivedFrom, derivType, certainty, comment).
#
# morpho_pars/_sents/_segs - hashes returned by do_morphosyntax; sentences
#                            receive :nam_id and :nameds, tokens :nlevel.
# filenames  - Hash; :tokfile is read, :namedfile is set.
# annolevels - files are registered under ANNOL_NAMED (feature files first,
#              then the struct file, as before).
#
# Optional features that are absent are recorded as "--".
# Returns nil immediately when ann_named.xml does not exist.
#
# Changes from the earlier version: File.exist? replaces File.exists?
# (removed in Ruby 3.2); the level fixpoint loop continues while ANY entity
# changed; entities are ordered by the numeric part of the end-token id
# rather than by string comparison.
def do_named prefix, file_id, filenames, annolevels, morpho_pars, morpho_sents, morpho_segs
  named_path = File.join(prefix, "ann_named.xml")
  return unless File.exist?(named_path)

  # Trailing number of an id such as "tok_12" (0 when there is none).
  id_number = lambda { |ident| ident.to_s[/\d+\z/].to_i }
  # "--" is not allowed inside an XML comment, so runs of hyphens are split.
  comment_safe = lambda { |text| text.to_s.gsub(/-(?=-)/, "- ") }
  # Optional <string> / <symbol> feature of a feature structure, "--" if absent.
  string_feature = lambda do |fs, name|
    node = fs.find("tei:f[@name='#{name}']/tei:string").first
    node ? node.inner_xml : "--"
  end
  symbol_feature = lambda do |fs, name|
    node = fs.find("tei:f[@name='#{name}']/tei:symbol").first
    node ? node.attributes['value'] : "--"
  end

  nam_pars = {}
  nam_sents = {}
  nam_segs = {}

  doc = XML::Parser.file(named_path).parse

  sts = []

  doc.root.namespaces.default_prefix="tei"

  doc.find("tei:TEI/tei:text/tei:body/tei:p").each do |par|
    id = par.attributes['id']
    cor = par.attributes['corresp']
    sid = cor[cor.index("#")+1..-1]
    nam_pars[id] = morpho_pars[sid]
    nam_pars[id][:nam_id] = id

    par.find("tei:s").each do |sent|
      id = sent.attributes['id']
      cor = sent.attributes['corresp']
      sid = cor[cor.index("#")+1..-1]
      sen = morpho_sents[sid]
      nam_sents[id] = sen
      sen[:nam_id] = id
      sen_nms = []
      sen[:nameds] = sen_nms
      sts << sen

      sent.find("tei:seg").each do |seg|
        id = seg.attributes["id"]

        nm = seg.find("tei:fs[@type='named']").first
        # orth is mandatory; the remaining features are optional.
        orth = nm.find("tei:f[@name='orth']/tei:string").first.inner_xml
        base = string_feature.call(nm, "base")
        type = symbol_feature.call(nm, "type")
        subtype = symbol_feature.call(nm, "subtype")
        thewhen = string_feature.call(nm, "when")
        certainty = symbol_feature.call(nm, "certainty")
        comment = string_feature.call(nm, "comment")

        der = nm.find("tei:f[@name='derived']/tei:fs[@type='derivation']").first
        derivType = "--"
        derivedFrom = "--"
        if der
          derivType = der.find("tei:f[@name='derivType']/tei:symbol").first.attributes['value']
          derivedFrom = der.find("tei:f[@name='derivedFrom']/tei:string").first.inner_xml
        end

        ptrs = seg.find("tei:ptr").map {|x| x.attributes["target"]}
        # Entities that point at nothing are skipped.
        next if ptrs.empty?

        named = {:nam_id => id, :orth => orth, :lemma => base, :type => type, :subtype => subtype,
          :when => thewhen, :certainty => certainty, :comment => comment,
          :derivType => derivType, :derivedFrom => derivedFrom, :ptrs => ptrs}
        sen_nms << named
        nam_segs[id] = named
      end
    end
  end

  # Resolve pointers: targets starting with "ann_morphosyntax.xml" refer to
  # tokens (21 = length of "ann_morphosyntax.xml#"), anything else to another
  # named entity, with an optional leading "#".
  nam_segs.each_value do |nam|
    nam[:ptrs] = nam[:ptrs].map do |ptr|
      if ptr.start_with? "ann_morphosyntax.xml"
        morpho_segs[ptr[21..-1]]
      else
        nam_segs[ptr[(ptr[0] == "#" ? 1 : 0)..-1]]
      end
    end
    nam[:nlevel] = 1
  end

  # Compute nesting levels bottom-up: tokens are level 0, an entity is one
  # above its highest constituent. Repeat until a full pass changes nothing.
  morpho_segs.each_value do |t|
    t[:nlevel] = 0
  end

  changed = true
  while changed
    changed = false
    nam_segs.each_value do |nam|
      new_level = nam[:ptrs].map{|x| x[:nlevel]}.max+1
      next if new_level == nam[:nlevel]
      nam[:nlevel] = new_level
      changed = true
    end
  end

  # Order the entities of each sentence by the token they end on, then level.
  nam_sents.each_value do |sen|
    sen[:nameds].each do |n|
      lastok = n[:ptrs].last
      # Descend through nested entities until a token (no :nam_id) is reached.
      while !lastok[:nam_id].nil?
        lastok = lastok[:ptrs].last
      end
      n[:ends_on] = lastok[:id]
    end
    sen[:nameds].sort_by! {|n| [id_number.call(n[:ends_on]), n[:nlevel]]}
  end

  # Write .named.xml

  namedfile = "#{file_id}.named.xml"

  tokfile = filenames[:tokfile]
  relid = 0
  namid = 0
  allnms = []
  File.open(File.join(prefix, namedfile), "w") do |f|
    out_header f, "struct", file_id, "named"
    f.puts
    f.puts '<structList xmlns:xlink="http://www.w3.org/1999/xlink" type="named">'

    sts.each do |sen|
      sen[:nameds].each do |nam|
        nam[:id] = "named_#{namid}"
        namid += 1
        allnms << nam
        f.puts "<struct id=\"#{nam[:id]}\"><!-- #{comment_safe.call(nam[:orth])} -->"
        nam[:ptrs].each do |p|
          # Entities live in this file, tokens in the token file.
          href = p[:nam_id] ? "##{p[:id]}" : "#{tokfile}##{p[:id]}"
          f.puts "<rel id=\"rel_#{relid}\" xlink:href=\"#{href}\"/>"
          relid += 1
        end
        f.puts "</struct>"
      end
    end

    f.puts "</structList>"
    f.puts "</paula>"
  end

  # Write the named-entity-level annotation files.
  write_feat prefix, file_id, filenames, annolevels, "named", [:type, :subtype, :when, :derivedFrom, :derivType, :certainty, :comment], allnms, namedfile, ANNOL_NAMED

  filenames[:namedfile] = namedfile
  annolevels[ANNOL_NAMED] << [namedfile, "named entities"]
end

# Reads ann_senses.xml from +prefix+, stores the sense reference of every
# annotated token under :sense and writes the token-level "sense" feature
# file via write_feat.
#
# segtion_pars/_sents/_segs - segmentation-id-indexed hashes from
#                             do_segmentation; entries are tagged :sens_id.
# filenames  - Hash; :tokfile is read.
# annolevels - the feature file is registered under ANNOL_TOK.
#
# Returns nil immediately when ann_senses.xml does not exist. File.exist?
# replaces File.exists?, which was removed in Ruby 3.2.
def do_senses prefix, file_id, filenames, annolevels, segtion_pars, segtion_sents, segtion_segs
  senses_path = File.join(prefix, "ann_senses.xml")
  return unless File.exist?(senses_path)

  doc = XML::Parser.file(senses_path).parse

  doc.root.namespaces.default_prefix="tei"

  # Part of a corresp value that follows its first "#".
  target_of = lambda { |corresp| corresp[corresp.index("#")+1..-1] }

  sens_pars = {}
  sens_sents = {}
  sens_segs = {}
  allsens = []

  doc.find("tei:TEI/tei:text/tei:body/tei:p").each do |par|
    par_id = par.attributes['id']
    sens_pars[par_id] = segtion_pars[target_of.call(par.attributes['corresp'])]
    sens_pars[par_id][:sens_id] = par_id

    par.find("tei:s").each do |sent|
      sent_id = sent.attributes['id']
      sens_sents[sent_id] = segtion_sents[target_of.call(sent.attributes['corresp'])]
      sens_sents[sent_id][:sens_id] = sent_id

      sent.find("tei:seg").each do |seg|
        seg_id = seg.attributes["id"]
        tok = segtion_segs[target_of.call(seg.attributes['corresp'])]
        sens_segs[seg_id] = tok
        tok[:sens_id] = seg_id

        tok[:sense] = seg.find("tei:fs[@type='sense']/tei:f[@name='sense']").first.attributes['fVal']
        allsens << tok
      end
    end
  end

  write_feat prefix, file_id, filenames, annolevels, "tok", [:sense], allsens, filenames[:tokfile], ANNOL_TOK
end

 
# Converts one corpus directory to PAULA: runs every layer converter in
# dependency order (text, segmentation, morphosyntax, words, groups, named
# entities, senses), then writes the annotation set and the DTDs.
#
# prefix - directory holding the input files; the output is written there
#          too. The document id is "nkjp-" plus the directory's base name.
#
# Prints a one-line summary with the elapsed time to $stderr.
# Returns [number of sentences, number of segments].
def process_file prefix
  file_id = "nkjp-#{File.basename(prefix)}"

  startt = Time.now

  filenames = {}
  # One list of [file, label] entries per annotation level.
  annolevels = Array.new(ANNOLS) { [] }

  abs = do_textlevel prefix, file_id, filenames, annolevels

  pars, segtion_pars, segtion_sents, segtion_segs = do_segmentation prefix, file_id, abs, filenames, annolevels

  morpho_pars, morpho_sents, morpho_segs = do_morphosyntax prefix, file_id, filenames, annolevels, pars, segtion_pars, segtion_sents, segtion_segs

  wor_pars, wor_sents, wor_segs = do_words prefix, file_id, filenames, annolevels, morpho_pars, morpho_sents, morpho_segs

  do_groups prefix, file_id, filenames, annolevels, wor_pars, wor_sents, wor_segs

  do_named prefix, file_id, filenames, annolevels, morpho_pars, morpho_sents, morpho_segs

  do_senses prefix, file_id, filenames, annolevels, segtion_pars, segtion_sents, segtion_segs

  do_annoset prefix, file_id, filenames, annolevels

  write_dtds prefix

  # The path is passed as an argument, not interpolated into the format
  # string, so a "%" in a directory name cannot break the formatting.
  $stderr.printf "%s loaded, %.1fs, %d sents, %d segs\n", prefix, Time.now-startt, morpho_sents.length, morpho_segs.length

  return morpho_sents.length, morpho_segs.length
end


# Running totals over all processed directories.
sentcntr = 0
segcntr = 0

# Fixed list of corpus directories to convert; it replaces whatever value
# +names+ held before this point.
names = ["110-2-000000034", "120-2-000010", "040-1-000004", "110-3-000009", "110-4-000000102", "110-4-000000002"]
names.each do |dir|
  sentences, segments = process_file(dir)
  sentcntr += sentences
  segcntr += segments
end

