#!/usr/bin/perl

# MakeLemma_1_6.cgi (021220) - cgi-version

# ----------------- SETTINGS ------------------

$GoodGraphs = 'a-zA-Z\.\-\'\_0123456789';
$GoodPoS = 'A-Z1-3\[\]\.\=\-';
$GoodPhon = '234589ACDEHIJLMNORSUWXYZabdefghijklmnoprstuvwxyz\!\:\~\%'; 

$url = "https://lab.homunculus.dk/cgi-bin/Ravn/MakeLemma_1_6.cgi";

$priobonus = 1000;

# -------------- INPUT ---------------- 

($Q = $ENV{'QUERY_STRING'}) =~ s/\%(..)/pack('C',hex($1))/eg;
($TIME) = $Q=~/\bT=(\d+)/;

read(STDIN,$datax,$ENV{'CONTENT_LENGTH'});
($bound) = $datax=~/^(--+\S+)/m;

for (split /$bound\r?\n?/,$datax) {
  /\bRAVNORTO\b\W*?([$GoodGraphs]+)/s and $ortoI=$1 and next;
  /\bRAVNPPOS\b\W*?([$GoodPoS]+)/ and $pposI=$1 and next;
  /\bRAVNPHON\b\W*?([$GoodPhon]+)/ and $phonI=$1 and next;
  /\bRAVNPRIO\b\W*?([;$GoodGraphs]+)/
    and $ortoPRIO=' '.join(' ',split(/;+/,$1)).' ' and next;

  /\bRAVNCLEM\b\W*?(\d+)/ and $clem=$1 and next;

  /\bRAVNOBSOLETE\b\W*?(yes|no)/ and $obsolete=$1 and next;
  /\bRAVNFOLD\b\W*?(yes|no)/ and $dofold=$1 and next;
  /\bRAVNRECYCDICT\b\W*?(yes|no)/ and $recycdict=$1 and next;

  $t=$_;  $t=~s/^\r?\n//; $t=~s/^\r?\n//;
  ($res) = $t=~/\bname=['"]?RAVN([A-Z]+)/;
  $t=~s!^content-.+\r?\n!!gim;
  if ( $res eq 'GOLDDICT' ) { $dict = $t };
};

if ($recycdict eq 'yes') {
  $res{'GOLDDICT'} = `cat /tmp/RAVN762_GOLDDICT`;
} elsif ($dict=~/\S/) {
  $res{'GOLDDICT'}=$dict;
  open(RES,">/tmp/RAVN762_GOLDDICT"); print RES $dict; close(RES);
}
  
$clem||=3;
$focuswd = "$ortoI,$pposI,$phonI";
$error.="<li>Bad input ($focuswd)</li>\n" unless $ortoI and $pposI and $phonI;
$prioMin=2;

# -------------- LEG ---------------- 


if (0) {    #   1: test offline;   0: full cgi

    # NB!!!  Hent sektionen i MakeLemma_1_5.cgi - hvis aktuelt...
  
}

# -------------- Compose lemma suggestions ---------------- 

$ORTO=$PPOS=$PHON=''; $ortoF=$pposF=$phonF=''; $lexflag=$lemmaflag=0;

$REp = ' '.join(' ',&unfold($pposI));

for my $lin (split /\n/,$res{'GOLDDICT'}) {
  next unless $lin=~/^ORTO:/ or $lin=~/^---/;    # NB! not much error control!
  next if $obsolete ne 'yes' and $lin=~/\tPPOS:\S+O\t/; # ignore obsolete forms
  chomp($rawline=$lin);
  if ($lin=~/^---/) {      # lemma separator: time to process lemma
    if ($pposF) {          # does lemma's PoS conform?
      ($comInLexO,$comInLexP) = (&comX("$ortoF $ortoI"),&comX("$phonF $phonI"));
      $prio = &prio($comInLexO,$comInLexP,$ortoF,$ortoI,$phonF,$phonI);
      if ($prio>=$prioMin) {  # is O/P similarity within limits?

        ($comO,$comP) = ( &com(join ' ',@ORTO), &com(join ' ',@PHON) );
        ($dLexO=$ortoF) =~ s/$comInLexO$//;  ($dLexP=$phonF) =~ s/$comInLexP$//;
        ($dInO=$ortoI)  =~ s/$comInLexO$//;  ($dInP=$phonI)  =~ s/$comInLexP$//;
        push(@lemma,[$prio,$Fix,[@ORTO],[@PPOS],[@PHON],$dInO,$dInP,$dLexO,$dLexP])
          if $comO=~/^$dLexO/ and $comP=~/^$dLexP/;      # is O/P map possible?
      }
    };
    @ORTO=@PPOS=@PHON=(); $ortoF=$pposF=$phonF=$Fix='';

  } else {                           # word form: store and continue
    @lin = split /\t/,$lin;
    ($orto) = $lin[0]=~/^ORTO:([$GoodGraphs]+)$/
      or $error.="<li>Bad ORTO in lex ($rawline)</li>\n";
    ($ppos) = $lin[1]=~/^PPOS:([$GoodPoS]+)$/
      or $error.="<li>Bad PPOS in lex ($rawline)</li>\n";
    ($phon) = $lin[2]=~/^PHON:([$GoodPhon]+)$/
      or $error.="<li>Bad PHON in lex ($rawline)</li>\n";
    $lexflag=1;
    push @ORTO,$orto; push @PPOS,$ppos; push @PHON,$phon;
    ($ortoF,$pposF,$phonF,$Fix) = ($orto,$ppos,$phon,$#PPOS) if $REp=~/ $ppos/;
  };
};

# -------------- select lemma suggestions, compose message ---------------- 

$error .= "<li>No lexical entries found</li>\n" unless $lexflag;
$error .= "<li>No relevant lemmas found in lex</li>\n" unless scalar(@lemma);

$lem_shown=0;
for my $lix ( map( \@{$_}, sort {@{$b}[0] <=> @{$a}[0]} @lemma) ) {
  ($prio,$Fix,$refO,$refPoS,$refP,$dInO,$dInP,$dLexO,$dLexP) = @{$lix};
  @ORTO=@{$refO}; @PPOS=@{$refPoS}; @PHON=@{$refP};
  @out=(); $olem=$ORTO[0]; $plem=$PHON[0];
  for my $ix (0..$#ORTO) {
    $ORTO[$ix] =~ s/^$dLexO/$dInO/; $PHON[$ix] =~ s/^$dLexP/$dInP/;
    $PPOS[$ix] = $pposI if $ix==$Fix and length($PPOS[$ix])<length($pposI);
    $PPOS[$ix] =~ s/\[(\S*?)\]/ '['.join('',sort &uniq(split '',$1)).']' /eg;
    push(@out,"ORTO:$ORTO[$ix]\tPPOS:$PPOS[$ix]\tPHON:$PHON[$ix]");
  };
  @out = &foldLemma(@out) if $dofold eq 'yes';
  $out[$Fix] = "<b>$out[$Fix]</b>";
  next if $seen{$out = join("\n",@out)}++; $lem_shown++;
  $OUT .= "--- LEMMA:".++$lemix." FIT=$prio ".
          "($olem,".&com(join(" ",@PPOS))."+,$plem) ---\n$out\n";

  ($tabout=$out) =~ s!\n+!</td></tr>\n<tr><td>\n!g; $tabout=~s!\t!</td><td>!g;
  $tableix++; 
  $TABOUT .= "<tr><td align=center><b>LEMMA $tableix</b></td></tr>\n
              <tr><td>$tabout</td></tr>\n";

  last if ++$cout >= $clem;
};

$OUT .= "---\n";
$TABOUT = "<table border=1 cellpadding=4 cellspacing=0>\n$TABOUT\n</table>\n";

$error .= "<li>Last lemma is incomplete (remember final '---'!)</li>\n"
  if $ORTO or $PPOS or $PHON;

$errors = "<h2>Errors found</h2><ul>$error</ul><br><hr>" if $error and $TIME;

$lem_found = 0 + scalar @lemma;
$stat = "<i>Input</i>:<code> '$ortoI' |$pposI| [$phonI]</code><br>".
        ($ortoPRIO? "<i>Prio.</i>:<code>$ortoPRIO</code><br>": ''). 
        "<i>Lemmas</i>: $lem_found found, $lem_shown shown<hr>" if $TIME; 
 
# -------------- compose <form> element ---------------- 

$T = time; $c='checked';
($obs1,$obs2) =
  $obsolete eq 'yes'? ($c,''): $obsolete eq 'no'? ('',$c): ('','');
($dofold1,$dofold2) =
  $dofold eq 'yes'? ($c,''): $dofold eq 'no'? ('',$c): ('','');
($recyc1,$recyc2) =
  $recycdict eq 'yes'? ($c,''): $recycdict eq 'no'? ('',$c): ('','');

$htmlform = <<RAVNEKROG



<form action="$url?T=$T" method="POST" enctype="multipart/form-data">

  Dictionary: <input type="file" name="RAVNGOLDDICT" accept=".csv">
  <p>

  ORTO: <input type="text" name="RAVNORTO" size=20><br>
  PPOS: <input type="text" name="RAVNPPOS" size=12><br>
  PHON: <input type="text" name="RAVNPHON" size=20><p>
  PRIO: <input type="text" name="RAVNPRIO" size=20><p>

  Lemmas in output: <i>max.</i><input type="text" name="RAVNCLEM" size=1 value="$clem">

  <p>
  Use Obsolete? 
    <i>yes</i><input type="radio" name="RAVNOBSOLETE" value="yes" $obs1>
    <i>no</i><input type="radio" name="RAVNOBSOLETE" value="no" $obs2>

  <br>
  Fold PoS? &nbsp; &nbsp;
    <i>yes</i><input type="radio" name="RAVNFOLD" value="yes" $dofold1>
    <i>no</i><input type="radio" name="RAVNFOLD" value="no" $dofold2>

  <br>
  Recycle Dict?
    <i>yes</i><input type="radio" name="RAVNRECYCDICT" value="yes" $recyc1>
    <i>no</i><input type="radio" name="RAVNRECYCDICT" value="no" $recyc2>

  <p>
  <input value="Make lemma" type=submit>

</form>
RAVNEKROG
;

# ----------------------- output --------------------

print <<RAVNRAVN
Content-type: text/html

<html>
<head>
</head>

<body bgcolor=#fffacc>

<h1>RAVN wordform-to-lemma</h1>
<i>Updated 2.12.2020</i>
<p>
<ol>

<li>Select a GOLD dictionary 
(e.g. <code>ASU_050520_guldordbog.csv</code>)</li>

<li>Insert a lexical entry
  (e.g. '<code>drongurin</code>', '<code>NCMSN==DU</code>', 
        '<code>dr%ONgUrIn</code>')</li>

<li>If needed, insert prioritized lexical form(s)
  (e.g. '<code>koppurin</code>' <i>or</i> 
        '<code>koppurin;risin;beiggin</code>')</li>

<li>Set options 
(Use obsolete forms? Fold PoS when possible? Recycle dictionary?)</li> 

<li>Press<code> <b>Make lemma</b></code></li>
</ol>

<p>

<table border=1 cellpadding=20>

<tr>
  <td valign=top>$errors  <code>$htmlform</code></td>
  <td valign=top>$stat<pre>$OUT</pre></td>
  <td valign=top><i>Lemmas as table:</i>\n<pre>$TABOUT</pre>\n</td>
</tr>
</table>

</body>
</html>
RAVNRAVN
;

# ----------------------- subs --------------------

sub com {$_=$_[0]; s!\S+!\\1\\S*!g; s!\\1!(\\S+)!; $_[0]=~/^$_/; $1};
# e.g. in = (' abc abcde','','abx  '); out = 'ab'  (space chars OK everywhere)

sub comX {$_=$_[0]; s!\S+!\\S*?\\1!g; s!\\1!(\\S+)!; $_[0]=~/$_$/; $1};
# As &com() reversed: in = (' abc','aebc xbc ',''); out = 'bc'  (space chars OK)

sub dist { " $_[0]" =~ / $_[1](\S*)/g };
# e.g. in = ('abc abcde abx','ab'); out = ('c','cde','x')

sub unfold {
  $_[0]=~/(.*?)\[([^\[\]]+)\](.*)/ or return $_[0];
  my(@x);
  for my $f ( map("$1$_$3", sort &uniq(split '',$2) ) ) {push(@x,&unfold($f))};
@x};
# e.g. in = 'VAPR-[SP][MFN][ID][ARU]-[NADG]U' - out = list of all 144 forms

sub prio {
  my($comInLexO,$comInLexP,$ortoF,$ortoI,$phonF,$phonI) = @_;

  $ortoF=~/$ortoI/ + $phonF=~/$phonI/ +
  $ortoI=~/$ortoF/ + $phonI=~/$phonF/ +
  $priobonus*$ortoPRIO=~/ $ortoF / +
  length($comInLexO) + length($comInLexP)
}

sub foldLemma {  # in: list of dict entries; out: same, but folded
my(@lem)=@_; my($ix,$x,$a1,$a2,$pos1,$pos2,$b1,$b2);
for (;;) {
  ($a1,$pos1,$b1) = $lem[$ix]   =~ /^(ORTO:\S+\tPPOS:)(\S+)(\tPHON:.+)$/;
  ($a2,$pos2,$b2) = $lem[$ix+1] =~ /^(ORTO:\S+\tPPOS:)(\S+)(\tPHON:.+)$/;
  if ( $a1 eq $a2 and $b1 eq $b2 and $x=&fold2($pos1,$pos2) ) {
  @lem=(@lem[0..$ix-1],"$a1$x$b1",@lem[$ix+2..$#lem]); $ix -= $ix>0
    } else {$ix++};
    last if $ix>=$#lem;
  };
@lem}

sub fold2 {
  my($o,$err,$c,@a,@b);
  return $_[0] if $_[1]=~/^$_[0]$/; return $_[1] if $_[0]=~/$_[1]$/;

  @a = map(chop eq '['? split '': &ustr($_), "$_[0]\[" =~ /([^\[\]]+[\[\]])/g );
  @b = map(chop eq '['? split '': &ustr($_), "$_[1]\[" =~ /([^\[\]]+[\[\]])/g );
  return '' if $#a!=$#b;

  for my $i (0..$#a) {
    if ($a[$i] eq $b[$i]) {$o .= length($a[$i])>1? "[$a[$i]]": $a[$i]}
    else { return '' if $err++; $o .= '['.&ustr($a[$i].$b[$i]).']' }
  };

$o};

sub uniq {my(%s,@l); for (@_) { push(@l,$_) if !$s{$_}++ }; @l };
sub ustr {$_ = join('',sort split('',$_[0])); s/(.)\1*/$1/g; $_};

# ----------------------- The end --------------------


