#!/usr/bin/perl

# MakeWdList_1_2.cgi - ver 170321

# ---------------- SETTINGS ------------------

$RemoveFromPhon = "\"\'\!\%\~";
$GoodGraphs = 'a-zA-Z_';
$GoodPhones = ':023456789aAbcCdDeEfghHiIjJklLmMnNoOpqQrRsStTuUvwWxXyYzZ';
$KeepTogether = 'AJW:';
@PoS = ('N','V','Adj','Adv','Num','Prep','Pron','Konj','Art','Int');
%PoS = map( ($_=>1), @PoS );

# ($clusterRE,$clusterS) = ( ' ([AJW:])', '\\$1' );

$AtomsForPrint = '['.join('][',@TreatAsAtom).']';
for (@TreatAsAtom) {s/[$RemoveFromPhon]//g; $AtomRE.="$_|"}; chop($AtomRE);

# -------------- INPUT ---------------- 

read(STDIN,$datax,%ENV{'CONTENT_LENGTH'});
($minl,$maxl) = (4,12);

$minl = $1 if $datax=~/MINL358\W+(\d+)/s;
$maxl = $1 if $datax=~/MAXL358\W+(\d+)/s;

# -------------- MAIN ---------------- 

$flag=0; $GOLDflag=0;

for (split /\n+/,$datax) {
  $GOLDflag ||= /^ORTO:/;
  last if $flag and /WebKitFormBoundary/i;
  next if /^-/ or /^Content-/i or !/\S/;
  ($o,$p) = 
    $GOLDflag? /^ORTO:(\S{$minl,$maxl})\t+PPOS:\S*\tPHON:(\S+)/:
               /^(\S{$minl,$maxl})\s.*?\[([^\[\]\s]+)\][^\[\]]*$/;
  next unless $o;                             # words with single phform only
  $p =~ s/[$RemoveFromPhon]//g;
  $ent = "$o\t[$p]\n";
  $rejected .= $ent and next
    if $o!~/^[$GoodGraphs]+$/ or $p!~/^[$GoodPhones]+$/;
  $p = join ' ',split('',$p); $p =~ s/ ([$KeepTogether])/$1/g; # dipht handling
  %p=(); for ($p=~/(\S+)/g) {$p{$_}=1; $ptot{$_}=1};
  push(@buf, "$o\t[".join(' ',sort keys %p)."]" ); 
  $validc++;  # valid forms count
  $flag=1;
};

@buf = map( /^\S+\t(.+)/g , sort map(rand."\t$_",@buf) );  # Randomize

while (%ptot) {
  $maxcover=0;
  for $ent (@buf) {
    ($o,$p) = $ent=~/^(\S+)\t\[(.*)\]/;
    $cover = 0; for ($p=~/(\S+)/g) {$cover += $ptot{$_}};
    if ($cover>$maxcover) {$maxcover=$cover; $maxp = $p; $maxo=$o};
  };
  $iter++;
  $wdlist .= "<tr><td>#$iter</td>".
             "<td>$maxo</td><td><code>$maxp</code></td><td>$maxcover</td>".
             "<td><code>".join(' ',sort keys %ptot)."</code></td></tr>\n";
  push @wds0, length($maxo)."\t$maxo";
  for ($maxp=~/(\S+)/g) {delete $ptot{$_}};
  last unless $maxcover;
};

$wds = join "<br>\n",map(/\t(\S+)/g ,sort {$a<=>$b} @wds0);

# -------------- OUTPUT ---------------- 

$txt1 = <<RAVNEMOR
<hr>
<form action="https://lab.homunculus.dk/cgi-bin/Ravn/MakeWdList_1_2.cgi" 
      method=POST enctype="multipart/form-data">
  <p>
  Word length (min and max):
  <input size=2 value="$minl" name="MINL358" type=text>
  -
  <input size=2 value="$maxl" name="MAXL358" type=text>

 <p>

  <input id="filhejs" value="Din fil" type="file" name="dinfil" accept=".csv">

  <p>
  <input value="Upload lexicon" name="dit ord" type=submit>

</form>

<b>OBS! Large lexicons make take a while to upload...</b>

RAVNEMOR
;

$txt2 = <<RAVNEFAR
<hr>
<b>READING LIST (count $iter, length $minl-$maxl, sorting short-long,
  parsed-as-gold: $GOLDflag)<p>$wds</b>
<hr>
<p>
<b>PROCESS LOG (word length $minl-$maxl):</b>
<p>
<table border=1>
<tr><th>iter</th><th>word</th><th>phones</th><th>cover</th><th>residual</th></tr>
$wdlist
</table>

<hr>
<br>
<b>GOOD ENTRIES (count): $validc</b><br> 
<b>BAD ENTRIES (listing):</b><br><pre>$rejected</pre> 
<br>
RAVNEFAR
;

$formsection = $validc? $txt2: $txt1;

print <<RAVNRAVN
Content-type: text/html

<html>
<head>
</head>

<body bgcolor=#ccffff>

<h1>Word list generator</h1>
<i>Updated 17.3.2021</i> 
<hr>

<pre>
IGNORED PHONE SYMBOLS:  [$RemoveFromPhon]
ACCEPTED PHONE SYMBOLS: [$GoodPhones]
TREATED AS POSTFIXED:   [$KeepTogether]
ACCEPTED GRAPHEMES:     [$GoodGraphs]
</pre>

<p>
$formsection
<hr>
</body>
</html>

RAVNRAVN
;



