#!/usr/bin/perl

# til brug for nodalida19/Ravn - ^`~
#  ver 200519 - modified 170321

# CALL: perl MakePhoneClusters_1_2.cgi

# ----------------- settings -------------

$lim = 3;   # DEFAULT value: only var-rules with c>=$lim are reported

# -------------- INPUT ---------------- 

read(STDIN,$datax,%ENV{'CONTENT_LENGTH'});

($bound) = $datax=~/^(--+\S+)/;

for (split /$bound\r?\n?/,$datax) {
  $t=$_;  $t=~s/^\r?\n//; $t=~s/^\r?\n//;
  ($res) = $t=~/\bname=['"]?RAVN([A-Z]+)/;
  $t=~s!^content-.+\r?\n!!gim;
  $res{$res} = $t;
};

$GOLDflag=0;
for (split /\n/, $res{'LEX'}) {
  $GOLDflag ||= /^ORTO:/i;
  ($o5,$p5) = $GOLDflag?
     /^ORTO:(\S+)\tPPOS:\S*\tPHON:(\S+)/i: 
     /^([^#]\S*)\s+\[(.*?)\]/;
  $o2p{ lc $o5 }=$p5 if $o5;
};
($rulelimit) = $res{'RULELIMIT'}=~/^(\d+)\r?$/m; $rulelimit||=$lim;

# ------------ parse dic and textgrids ------------

$lines = $res{'TEXTGRID'};

for (split /\n+/,$lines) {
  /^\s+name\s*=.+(SAMPA|orthography|SPK0\|t\|v)/i and $tier=$1 and next;
  /^xmax\s*=\s*([\d\.]+) / and $tix=$1 and next;
  /^         +xmin\s*=\s*([\d\.]+) / and $xmin=$1 and next;
  /^         +xmax\s*=\s*([\d\.]+) / and $xmax=$1 and next;
  /^         +text\s*=\s*"(\S.*) " \r?$/ and
                 $seg{$tix}{"$xmin-$xmax"}[$tier eq 'SAMPA'? 1: 0]=$1 and ++$cseg;

};

# ------------ main ------------

for $name (keys %seg) { for $xm (sort keys %{$seg{$name}}) {
  ($oseg,$pseg) = ( $seg{$name}{$xm}[0], $seg{$name}{$xm}[1] ); 

$pseg=&normp($pseg); 

  next if $oseg!~/\S/ or $pseg!~/\S/; $csyncseg++; 
  $pseg=~s/[ ":]//g;  # remove diacritics and spurious space chars from p-form
  $flag=0; $plex='';
  for $o (split /\s+/,$oseg) {
    $p = &normp($o2p{lc $o}); if ($p=~/\S/) {$plex.=$p} else {$flag=1}; 
};
  next if $flag or $plex!~/\S/; # die "$flag\t$plex\t$pseg\n";

  ($fit_,$xA_,$xB_) = &alignp($plex,$pseg);
  @xA=@{$xA_}; @xB=@{$xB_}; @fit=@{$fit_};
  &printfit($fit_,$xA_,$xB_,$plex,$pseg);
  for (0..$#xA) {$mis{"@{$xA[$_]} -> @{$xB[$_]}"}++; $cmis++};
  $cfit+= scalar @fit;
}};

for (sort keys %mis) {push @c,"$mis{$_}\t($_)\n"};
for (sort {$b<=>$a} @c) {
  my($c); ($c)=/^(\d+)/; $c>0 and ++$RC; $c>=$rulelimit and ++$RCq and $OUTmis .= "$_"
};

# ---------------------- write output --------------------

print <<RAVNRAVN
Content-type: text/html

<html>
<head>
</head>

<body bgcolor=#eeeebb>

<h1>RAVN pronunciation variation mapper</h1>
<i>Updated 17.3.2021</i>

<hr>
<b>Compares lexical and descriptive phonetic forms</b>,<br>
reporting results as (i) frequency-sorted 
transduction rules SAMPA&lt;=&gt;SAMPA, (ii) word-by-word analyses.

<hr>
<form action="https://lab.homunculus.dk/cgi-bin/Ravn/MakePhoneClusters_1_2.cgi" 
      method=POST enctype="multipart/form-data">
  <p>

  Lexicon ('fuldformsordbog' or 'guldordbog') as .csv <br>
 <input type="file" name="RAVNLEX" accept=".csv">

<p>

  Transcription data as .textGrid (or a concatenation of several .textGrid<i>s</i>) <br>
  <input type="file" name="RAVNTEXTGRID" accept=".textGrid">

<p>
  Transduction rules reported must have at least <input type="text" size=2
    value=$rulelimit name="RAVNRULELIMIT"> instancies<br>
  <input value="Upload resources" type=submit>

</form>

<hr>

<b>Transduction rules (total=$RC, in range=$RCq, parsed-as-gold=$GOLDflag)</b>
<pre>$OUTmis</pre><hr>
<b>Lexeme analyses</b>
<pre>$OUT</pre><hr>

</body>
</html>

RAVNRAVN
;

# ------------ subs ------------

sub normp {$_=$_[0]; s/[ ":]//g; $_};   # normalize phons for comparison

sub alignp {    # NB! both args must be length>0
  my(@a,@b);
   @a=split '',$_[0]; @b=split '',$_[1]; 
  my(@fit,@xA,@xB,$i,$j); $i=$j=0; 

  for (;;) {
    if ($i<=$#a and $j<=$#b and $a[$i] eq $b[$j]) {
      push(@fit,$a[$i]); $i++; $j++ 
    }
    elsif ($i<$#a and $j<$#b and $a[$i].$a[$i+1] eq $b[$j+1].$b[$j]) {
      push(@xA,[@a[$i,$i+1]]); push(@xB,[@b[$j,$j+1]]); $i+=2; $j+=2      
    }
    elsif ($i<$#a and $j<=$#b and $a[$i+1] eq $b[$j]) {
      push(@xA,[@a[$i]]); push(@xB,[]); $i++
    }
    elsif ($i<=$#a and $j<$#b and $a[$i] eq $b[$j+1]) {
      push(@xA,[]); push(@xB,[@b[$j]]); $j++
    }
    elsif ($i<$#a and $j<$#b and $a[$i+1] eq $b[$j+1]) {
      push(@xA,[@a[$i]]); push(@xB,[@b[$j]]); $i++; $j++
    }
    elsif ($i<$#a-1 and $j<=$#b and $a[$i+2] eq $b[$j]) {
      push(@xA,[@a[$i,$i+1]]); push(@xB,[]); $i+=2
    }
    elsif ($i<=$#a and $j<$#b-1 and $a[$i] eq $b[$j+2]) {
      push(@xA,[]); push(@xB,[@b[$j,$j+1]]); $j+=2
    }
    elsif ($i<$#a and $j<$#b-1 and $a[$i+1] eq $b[$j+2]) {
      push(@xA,[@a[$i]]); push(@xB,[@b[$j,$j+1]]); $j+=2; $i++
    }
    elsif ($i<$#a-1 and $j<$#b and $a[$i+2] eq $b[$j+1]) {
      push(@xA,[@a[$i,$i+1]]); push(@xB,[@b[$j]]); $i+=2; $j++
    }
    elsif ($i<$#a-2 and $j<=$#b and $a[$i+3] eq $b[$j]) {
      push(@xA,[@a[$i..$i+2]]); push(@xB,[]); $i+=3
    }
    elsif ($i<=$#a and $j<$#b-2 and $a[$i] eq $b[$j+3]) {
      push(@xA,[]); push(@xB,[@b[$j..$j+2]]); $j+=3
    }
    elsif ($i<$#a-2 and $j<$#b and $a[$i+3] eq $b[$j+1]) {
      push(@xA,[@a[$i..$i+2]]); push(@xB,[@b[$j]]); $i+=3; $j++
    }
    elsif ($j<$#b-2 and $i<$#a and $a[$i+1] eq $b[$j+3]) {
      push(@xA,[@a[$i]]); push(@xB,[@b[$j..$j+2]]); $j+=3; $i++
    }
    elsif ($i<=$#a and $j<=$#b) {
      push(@xA,[@a[$i]]); push(@xB,[@b[$j]]); $i++; $j++
    }
    else {
      push(@xA,[@a[$i..$#a]]) and push(@xB,[]) if $i<=$#a; 
      push(@xB,[@b[$j..$#b]]) and push(@xA,[]) if $j<=$#b; 
      last
    };
  };
  
([@fit],[@xA],[@xB])
};

sub printfit {
  my($fit_,$xA_,$xB_,$plex,$pseg) =  @_; my(@xA,@xB,@fit,$mis);
  @xA=@{$xA_}; @xB=@{$xB_}; @fit=@{$fit_};
    die "$#xA!=$#xB!" unless  $#xA==$#xB;
  $mis="";
  for (0..$#xA) {
    $mis.="(".join("-",@{$xA[$_]})." -> ".join("-",@{$xB[$_]}).") "
  }
  $OUT .= "[$plex]\t[$pseg]\tFIT=@fit\t$mis\n";

};

# ------------ the end ------------


