#! /bin/sh
# Set LANG=C for this script and every program it starts.
LANG=C
export LANG
# Refresh the Pulleyblank checkout.  The subshell keeps our own working
# directory unchanged, and git pull only runs when the cd succeeded, so a
# missing checkout never triggers a pull in the current directory.
( cd ../../ud-kanbun/Pulleyblank && git pull )
# Create (or truncate) both gloss dictionaries as empty files.
cp /dev/null dict.gloss.csv
cp /dev/null dict.yasuoka.csv
# Sort gloss.orig.txt in place, keyed from its second field to end of line.
# "-k 2" is the POSIX spelling of the obsolete "+1" key, which current sort
# implementations may interpret as a file name instead of a key.
sort -k 2 -o gloss.orig.txt gloss.orig.txt
# Convert each 4- or 5-field line of gloss.orig.txt into a CSV row and pipe
# it through mc2ud.nawk.  Rows whose last field contains "*" or "?" go to
# dict.yasuoka.csv with "*" as the final column; all other rows go to
# dict.gloss.csv with the last field kept as the final column.
nawk '
BEGIN {
  to_yasuoka = "mc2ud.nawk > dict.yasuoka.csv";
  to_gloss = "mc2ud.nawk > dict.gloss.csv";
}
NF == 4 {
  if ($4 !~ /[*?]/)
    printf("%s,0,0,0,%s,*,*,%s,*,*,%s\n", $1, $3, $2, $4) | to_gloss;
  else
    printf("%s,0,0,0,%s,*,*,%s,*,*,*\n", $1, $3, $2) | to_yasuoka;
}
NF == 5 {
  if ($5 !~ /[*?]/)
    printf("%s,0,0,0,%s,*,%s,%s,*,*,%s\n", $1, $4, $3, $2, $5) | to_gloss;
  else
    printf("%s,0,0,0,%s,*,%s,%s,*,*,*\n", $1, $4, $3, $2) | to_yasuoka;
}' gloss.orig.txt
# Build one corpus file per source.  For each target name the subshell
# changes into the matching ud-kanbun directory and writes its text files to
# stdout; the second nawk turns that stream (presumably CoNLL-U -- confirm)
# into "field2<TAB>field5,*,*,field3,*,*,gloss" lines separated by EOS, and
# mc2ud.nawk writes the result to the target file.
# Every cd is checked: if a directory is missing the subshell stops instead
# of reading */*.txt relative to the wrong directory.
for F in corpus.pulleyblank.mc corpus.mencius.mc corpus.lunyu.mc corpus.liji.mc corpus.18shilue.mc corpus.center.mc
do ( case "$F" in
     corpus.mencius.mc) cd ../../ud-kanbun/kanripo/kR1h0001 || exit 1
	cat */*.txt ;;
     corpus.lunyu.mc) cd ../../ud-kanbun/kanripo/kR1h0004 || exit 1
	cat */*.txt ;;
     corpus.liji.mc) cd ../../ud-kanbun/kanripo/kR1d0052 || exit 1
	cat */*.txt ;;
     corpus.18shilue.mc) cd ../../ud-kanbun/18shilue || exit 1
	cat 00?/*.txt 01[0-8]/*.txt ;;
     corpus.center.mc) cd ../../ud-kanbun/center-exam || exit 1
	cat */*.txt ;;
     corpus.pulleyblank.mc) cd ../../ud-kanbun/Pulleyblank || exit 1
	for f in Pulleyblank*.txt
	do # Number of "# text = " lines in this file, handed to awk as n.
	   count=`grep -c '^# text = ' "$f"`
	   nawk -v n="${count:-1}" '
# n counts the "# text = " lines still to come in this file.  The last one
# gets an ideographic full stop appended, every earlier one an ideographic
# comma, and a matching PUNCT token line is added before the blank line
# that follows each block of token lines.
{
  if($0~/^# text = /){
    if(n<2)
      printf("%s。\n",$0);
    else
      printf("%s、\n",$0);
    n--;
    root=0;
  }
  else if($0==""){
    # id is the first field of the previous token line; root is the id of
    # the line whose field 8 was "root" (0 if none was seen).
    id++;
    if(n<1)
      printf("%d\t。\t。\tPUNCT\ts,記号,句点,*\t_\t%d\tpunct\t_\tSpaceAfter=No\n\n",id,root);
    else
      printf("%d\t、\t、\tPUNCT\ts,記号,読点,*\t_\t%d\tpunct\t_\tSpaceAfter=No\n\n",id,root);
    root=0;
  }
  else{
    printf("%s\n",$0);
    id=$1;
    if($8=="root")
      root=id;
  }
}' "$f"
	done ;;
     esac
   ) | nawk '
# Only lines whose first character exists and is not "#" are processed.
# NOTE(review): an empty line cannot match /^[^#]/, so the $0=="" branch
# below never runs; sentence ends are detected by the id in field 1 not
# increasing, plus the END rule for the final sentence.
/^[^#]/{
  if($0==""){
    if(n>0)
      printf("EOS\n");
    n=0;
  }
  else{
    if(n>=$1)
      printf("EOS\n");
    n=$1;
    # Take the value of Gloss=... from field 10, or "*" when absent.
    if(match($10,/Gloss=[^|]+/)>0)
      g=substr($10,RSTART+6,RLENGTH-6);
    else
      g="*";
    printf("%s\t%s,*,*,%s,*,*,%s\n",$2,$5,$3,g);
  }
}
END{
  if(n>0)
    printf("EOS\n");
}' | mc2ud.nawk > "$F"
done

# Re-shape the three seed corpora from ../seed_names2: EOS lines pass
# through unchanged; on every other line the comma-separated second field is
# rebuilt from its parts 1-4 and 7, with "*" in the remaining columns.  The
# result is filtered by mc2ud.nawk into a file of the same name here.
for F in corpus.KT0.mc corpus.kanjikai.mc corpus.misc.mc
do
  nawk '
$1 == "EOS" {
  printf("EOS\n");
  next;
}
{
  split($2, feat, ",");
  printf("%s\t%s,%s,%s,%s,*,*,%s,*,*,*\n", $1, feat[1], feat[2], feat[3], feat[4], feat[7]);
}' "../seed_names2/$F" | mc2ud.nawk > "$F"
done

# Build dict.KTp2.csv.  Input rows are (a) every corpus line that does not
# contain EOS, with its first tab replaced by ",0,0,0,", and (b) the KTp2 and
# girei seed CSVs with their last three fields reset to "*".  Commas are
# turned into spaces so awk can split the rows; only rows whose first field
# is 3 bytes long (LANG=C) and is not already a first field in
# dict.gloss.csv or dict.yasuoka.csv are kept.
{
  sed -e /EOS/d -e 's/	/,0,0,0,/' \
    corpus.pulleyblank.mc corpus.mencius.mc corpus.lunyu.mc \
    corpus.liji.mc corpus.18shilue.mc corpus.center.mc \
    corpus.KT0.mc corpus.kanjikai.mc corpus.misc.mc
  sed 's/,[^,]*,[^,]*,[^,]*$/,*,*,*/' \
    ../seed_names2/KTp2.*.csv ../seed_names2/name.Noun.girei.csv
} | sort -u | tr , ' ' | nawk '
BEGIN {
  # 39 is the single-quote character, so the command ends in: tr , (quoted space)
  loader = sprintf("cat dict.gloss.csv dict.yasuoka.csv | tr , %c %c", 39, 39);
  while ((loader | getline) > 0)
    known[$1] = 1;
  close(loader);
}
length($1) == 3 && !($1 in known) {
  printf("%s\n", $0);
}' | tr ' ' , | simplify.py | mc2ud.nawk | sort -u > dict.KTp2.csv
# Build dict.name.csv from the personal-name, surname and place-name seed
# CSVs.  sed first resets the last three comma-separated fields of every row
# to "*"; the awk program then labels and filters the rows, and the result
# goes through simplify.py, mc2ud.nawk and sort -u.
sed 's/,[^,]*,[^,]*,[^,]*$/,*,*,*/' ../seed_names2/name.Noun.personal.csv ../seed_names2/name.Noun.surname.csv ../seed_names2/KTp2.Noun.place.?.csv | sort -u | awk -F, '
BEGIN{
  # Load the existing gloss dictionaries (tabs turned into commas; 39 is the
  # single-quote character used to quote \011 for tr) and remember each
  # combination of fields 1, 5, 6, 7 and 8 already present there.
  c=sprintf("cat dict.gloss.csv dict.yasuoka.csv | tr %c\\011%c ,",39,39);
  while((c|getline)>0)
    w[$1,$5,$6,$7,$8]=1;
  close(c);
  # m is the two-character string with codes 240 and 175 (bytes 0xF0 0xAF);
  # rows containing it in field 1 or 11 are dropped below.
  # NOTE(review): presumably this excludes a block of supplementary CJK
  # characters -- confirm.
  m=sprintf("%c%c",240,175);
}
{
  # s is the value written as the last column: field 14 by default, or a
  # bracketed category label chosen from fields 5-8.
  s=$14;
  if($5=="n"&&$6=="名詞"){
    if($7=="人"&&$8=="姓氏")
      s="[surname]";
    else if($7=="人"&&$8=="名")
      s="[given-name]";
    else if($7=="主体"&&$8=="書物")
      s="[book-name]";
    else if($7=="主体"&&$8=="国名")
      s="[country-name]";
    else if($7=="固定物"&&$8=="地名"){
      # Place names: when field 1 is longer than 7 (bytes under LANG=C), skip
      # those ending in one of the three administrative suffixes; otherwise
      # skip those matching the second pattern.  Everything else is labelled.
      if(length($1)>7){
	if($1~/(府|縣|郡)$/)
	  next;
      }
      else if($1~/(海|海)$/)
        next;
      s="[place-name]";
    }
  }
  # Emit the row only if it has a label other than "*", field 11 is not "*",
  # neither field 1 nor field 11 contains m, and the same key is not already
  # in the gloss dictionaries.
  if(s!="*"&&$11!="*"&&index($1,m)<1&&index($11,m)<1&&w[$1,$5,$6,$7,$8]!=1)
    printf("%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n",$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,s);
}' | simplify.py | mc2ud.nawk | sort -u > dict.name.csv
# Build dict.symbol.csv from the Symbol seed list: reset the last three
# fields to "*", de-duplicate, and filter through mc2ud.nawk.
sed 's/,[^,]*,[^,]*,[^,]*$/,*,*,*/' ../seed_names2/Symbol.csv |
  sort -u |
  mc2ud.nawk > dict.symbol.csv
# Do not leave behind a missing-or-empty dict.yasuoka.csv.
[ -s dict.yasuoka.csv ] || /bin/rm -f dict.yasuoka.csv
exit 0
