#!/usr/bin/perl

# EvalBlark_1_9.cgi (050121)

# Changes from EvalBlark_1_6.cgi :
#   * added lexical control in TextGrids
#   * added grapheme control in TextGrids

# ---------------- SETTINGS ------------------

$ABC = 'a-zA-Z';
$punc = '\-\_\.'."\\'";
$GoodGraphs = $ABC.$punc;

# @PoS = ('N','V','Adj','Adv','Num','Prep','Pron','Konj','Art','Int');
# %PoS = map( ($_=>1), @PoS );
# $PoSlist = "{".join(",",@PoS)."}";

# -------------- INPUT ---------------- 

read(STDIN,$datax,%ENV{'CONTENT_LENGTH'});

($bound) = $datax=~/^(--+\S+)/;

for (split /$bound\r?\n?/,$datax) {
  $t=$_;  $t=~s/^\r?\n//; $t=~s/^\r?\n//;
  ($res) = $t=~/\bname=['"]?RAVN([A-Z]+)/;
  $t=~s!^content-.+\r?\n!!gim;
  if ( ($resX) = $res=~/^CHECK([A-Z]+)/ ) { 
    $res{$resX} = `cat /tmp/RAVN4321_$resX`;
    $sampaOK="SAMPA er fundet (".length($res{'SAMPA'}).")" if $resX eq 'SAMPA';
  } elsif ($t=~/\S/ and $res) { 
    $res{$res}=$t; open(RES,">/tmp/RAVN4321_$res"); print RES $t; close(RES);
  };
};

for ($res{'PARTOFSPEECH'}=~/(\S+)/g) {$PoS{$_}++}; 

# -------------- MAIN --------------

#   *********** SAMPA defined **************'

@sampa = sort {length($b)<=>length($a)} $res{'SAMPA'}=~/(\S+)/g;

#   ********** LEX checked ***********

for $lin (split /\r*\n+/,$res{'LEX'}) {
  $entc++;
  next if $lin=~/^[\s#]/ or $lin=~/^---/ or $lin!~/\S/;   # comment/separ/blank
  @f0 = split /\t/,$lin;
  $gold = $lin=~/^ORTO:/;
  
  if ($gold) {
    $badgold = $lin;
    $badgold =~ s/^ORTO:\S+\tPPOS:\S+(\t+PHON:\S+)+//;  # oblig. part
    $badgold =~ s/\t+COMM:[^\t]*//g;                    # optional comments
    $badgold =~ s/\t//g;    # <tab> is permittet - any remainder is non-gold!
    @f=();
    ($f[0] = $f0[0]) =~ s/^ORTO://;
    ($f[2] = $f0[1]) =~ s/^PPOS://;
    for (2..$#f0) { $f0[$_] =~ s/^PHON:(.*)// and $f[1].="[$1]" }
  } elsif ($f0[1]=~/^\[.*?\]/) {  # line format as in IS_170719_ordbog.csv
    @f = @f0;
  } else {  # line format as in RAVN1234_LEX
    @f = $f0[3]=~/\S/? 
     ($f0[0],"[$f0[2]][$f0[3]]",$f0[1]): 
     ($f0[0],"[$f0[2]]",$f0[1]);
  };

  $f = join '',@f[0..2];
  $lexforms{ $f[0] }++;

  if (length $badgold) {
    $errtab .= &tabfy( $entc, '<nobr>Unpure</nobr>', 
                       "c(".ord($badgold).")&#8658;$badgold", $lin );
###    $errtab .= &tabfy( $entc, 'Non-gold', ord($badgold), $lin );
  };

  if ($c = $f=~s/(\s)/$1/g) {
    $errtab .= &tabfy( $entc, 'Whitespace', $c, $lin );
  };

  if (@e = $f[0]=~/([^$GoodGraphs])/g) {
    $errtab .=
      &tabfy( $entc, 'Bad character', '{'.join(',',@e).'}', "\"$f[0]\"");
  };

  $_=$f[1];
  while (s/\[(\S*?)\]//) {
    $p=$1; $o2p{lc($f[0])} ||= $p;
    if ($p!~/\S/) { $errtab .= &tabfy( $entc, 'Empty ph-form', '[]', $f[1]) } 
    elsif (scalar(@sampa)>1) {
      for (@sampa) {$p=~s!$_!!g and $seenphone{$_}++};
      $errtab .= &tabfy( $entc, 'Unknown phones', "[$p]", $f[1] ) if $p=~/\S/;
    };
  };
  $errtab .= &tabfy( $entc, 'ph-garbage', $_, $f[1] ) if /\S/;

  $errtab .= &tabfy( $entc, 'no PoS', '', $lin ) if $f[2]!~/\S/;
  for $pos (split /,/,$f[2]) {
    if (%PoS and !$PoS{$pos}) {
      $errtab .= &tabfy( $entc, ($pos? 'Bad':'Empty').' PoS', $pos, $lin );
    };
  };

###  $errtab .= &tabfy( $entc, 'Bad frequency', $f[3], $lin ) if $f[3]=~/\D/;

};

#   ********** TextGrid(s) checked ***********

if ($res{'TEXTGRID'}) {
  my($txt) = $res{'TEXTGRID'}; chop($txt); chop($txt);

  my($phtier) = $txt=~/^\s+name = "\S*SAMPA\S*"(.+?)   item \[/msi;
  $phtier =~ s/\n//g; $phtier =~ s/ +intervals /\nintervals /g;
  for (split /\n/,$phtier) { 
    my($int,$p) = /^intervals \[(\d+)\].*?text = "(.*)"/; $p=~s/""/"/g;
    $p0=$p;
    for (@sampa) {$p=~s!$_!!g and $seenphoneTG{$_}++}; $p=~s/ //g;
    $errtextgrids .= &tabfy( $int, 'Unknown phones', "[$p]", "[$p0]" ) if $p=~/\S/;
  };

  my($orttier) = $txt=~/^\s+name = "\S*orthography\S*"(.+?)   item \[/msi;
  $orttier =~ s/\n//g; $orttier =~ s/ +intervals /\nintervals /g;
  for (split /\n/,$orttier) { 
    my($int,$o) = /^intervals \[(\d+)\].*?text = "(.*)"/;
    $o=~s/""/"/g; $o=~s/<[\w ]*>/ /g; $o=~s/ +/ /g; 

    my($badgraphs) = '"'.(join '" "',$o=~/([^ $GoodGraphs])/g).'"';
    $errtextgrids .= &tabfy($int, 'Illegal graphemes', $badgraphs, "\"$o\"" ) 
      if $badgraphs=~/[^ "]/;
      
    for ($o=~/([$GoodGraphs]+)/g) { 
      $lexforms{$_} or 
      $errtextgrids .= &tabfy($int, 'Not in lexicon', "\"$_\"", "\"$o\"" ) 
    };
  };
}

# ------------------- MANUS checked ----------------------

if (%lexforms) {
  for $wd ($res{'MANUS'}=~/([$GoodGraphs]+)/g) {
    ($wdXpunc=$wd) =~ s/[$punc]+$//g;
    $mx{$wdXpunc}++ unless $lexforms{$wd} or $lexforms{&lowerchar($wd)}
      or $lexforms{$wdXpunc} or $lexforms{&lowerchar($wdXpunc)};
    $p=$o2p{lc($wd)};
    for (@sampa) {$p=~s!$_!!g and $seenphoneMS{$_}++}; $p=~s/ //g;
  };
  for (sort keys %mx) {$errmanus .= " &nbsp; '$_' ($mx{$_})<br>\n" };
}

# ---------------------- SAMPA checked --------------------

for (@sampa) { 
  $sampa0 .= "[$_]" unless $seenphone{$_};
  $sampa00 .= "[$_]" unless $seenphoneTG{$_};
  $sampa000 .= "[$_]" unless $seenphoneMS{$_};
};

# -------------- OUTPUT ---------------- 

$txtbody = <<RAVNEMOR
<form action="https://lab.homunculus.dk/cgi-bin/Ravn/EvalBlark_1_9.cgi" 
      method=POST enctype="multipart/form-data">

<table cellpadding=5 border=2>

<tr>
<td>

  SAMPA (phone inventory) as .txt <br>
  Select previous <input type="checkbox" name="RAVNCHECKSAMPA" checked>
  or
  <input type="file" name="RAVNSAMPA" accept=".txt">

  </td><td>

  PAROLE (PoS inventory) as .txt <br>
   Select previous <input type="checkbox" name="RAVNCHECKPARTOFSPEECH" checked>
  or
  <input type="file" name="RAVNPARTOFSPEECH" accept=".txt">

</td>

<td>
  Lexicon (annotated word forms; <i>GOLD</i>-standard is supported) as .csv <br>
    Select previous <input type="checkbox" name="RAVNCHECKLEX">
  or
 <input type="file" name="RAVNLEX" accept=".csv">
  <p>
</td></tr>

<tr>
<td>
  Transcription as .textGrid (ISO-8859)<br>
   Select previous <input type="checkbox" name="RAVNCHECKTEXTGRID">
  or
  <input type="file" name="RAVNTEXTGRID" accept=".textGrid">
</td>
<td>

  Manus as .txt (recommended) or .rtf (experimental)<br>
    Select previous <input type="checkbox" name="RAVNCHECKMANUS">
  or
 <input type="file" name="RAVNMANUS" accept=".txt">
</td>
<td>


  <input value="Upload resources" type=submit>
</td></tr>
</table>


</form>
RAVNEMOR
;

$report .= $entc==0? "<h3>No LEXICON uploaded</h3>": 
          $errtab!~/\S/? "<h3>No errors found in LEXICON</h3>":
          "<p>\n<hr><b>Errors found in LEXICON</b> (in $entc entries)<p>
           <table border=1>".
          "<tr><th>line</th><th>errortype</th>".
          "<th>instance</th><th>field</th></td>$errtab\n</table>\n<hr>";

$report .= length($res{'TEXTGRID'})==0? "<h3>No TEXTGRID uploaded</h3>": 
          $errtextgrids!~/\S/? "<h3>No errors found in TEXTGRID</h3>":
          "<p>\n<hr><b>Errors found in TEXTGRID</b> <p> <table border=1>".
          "<tr><th>interval</th><th>errortype</th>".
          "<th>instance</th><th>field</th></td>$errtextgrids\n".
	  "</table>\n<hr>";

$report .= length($res{'MANUS'})==0? "<h3>No MANUS uploaded</h3>": 
          $errmanus!~/\S/ && %lexforms? 
            "<h3>All words in MANUS were recognized</h3>":
          $errmanus!~/\S/? "<h3>MANUS uploaded, but no lexicon</h3>":
          "<p>\n<hr><b>MANUS has unrecognized words:</b> <p> $errmanus\n<hr>";

if (scalar(@PoS)==0) { $report .=  "<h3>No PAROLE uploaded</h3>" };

if (scalar(@sampa)) {
  if (length($res{'LEX'})) {
    $report .= length($sampa0) && $entc>0?
     "<p>\n<b>SAMPA phones not used in LEXICON:</b> <pre>$sampa0</pre><hr>":
       $entc>0? "<b>All ".scalar(@sampa)." SAMPA entries used in LEXICON<p><br>":
      "";
  };

  $report .= length($sampa00) && length($res{'TEXTGRID'})?
   "<p>\n<b>SAMPA phones not used in TEXTGRID:</b> <pre>$sampa00</pre><hr>":
    length($res{'TEXTGRID'})? "<b>All ".scalar(@sampa).
      " SAMPA entries used in TEXTGRID<p><br>":
   "";

  if (length($res{'LEX'})) {
    $report .= length($sampa000) && length($res{'MANUS'})?
     "<p>\n<b>SAMPA phones not used in MANUS:</b> <pre>$sampa000</pre><hr>":
      length($res{'MANUS'})? "<b>All ".scalar(@sampa).
        " SAMPA entries used in TEXTGRID<p><br>":
     "";
  };
}
else { $report .=  "<h3>No SAMPA uploaded</h3>" }; 

print <<RAVNRAVN
Content-type: text/html

<html>
<head>
</head>

<body bgcolor=#fff7ba>

<h1>RAVN resource checker</h1>
<i>Updated 7.1.2021</i>

<hr>
<b>Evaluating lexicon, SAMPA, TextGrids and/or Manus for consistency<p>
NB! Gold-lexicons are auto-detected, but not thoroughly PoS-checked wrt. PoS-inventory
</b>
<p>

<b>Accepted graphemes: <code>$GoodGraphs</code></b>

$txtbody

$report
</body>
<hr>
</html>

RAVNRAVN
;

# ---------------------- subs --------------------

sub tabfy { "<tr><td>".join("</td><td>",@_)."</td></tr>\n" };
sub lowerchar {  $_=lc($_[0]); tr///; $_};

# ---------------------- The end --------------------------


