#! /bin/sh
# makedict.sh -- regenerate the dict.*.csv and corpus.*.mc files in the
# current directory from gloss.orig.txt, ../seed_names2 and ../../ud-kanbun.
#
# The shebang has to be the very first line of the file to take effect, so
# nothing (including web-page residue) may precede it.

# Default every tool started below to the C locale.  LC_ALL / LC_* set in the
# caller's environment still take precedence over LANG.
LANG=C
export LANG

# Refresh the Pulleyblank checkout.  The subshell keeps the cd local to this
# step; `&&` stops `git pull` from running in whatever directory we happen to
# be in when the cd fails.
( cd ../../ud-kanbun/Pulleyblank &&
  git pull
)
    
    # Create (or empty) both gloss dictionaries up front, so that each file
    # exists even when no entry ends up being written to it.
    : > dict.gloss.csv
    : > dict.yasuoka.csv
    
    # Sort gloss.orig.txt in place, keyed on everything from the second
    # blank-separated field to the end of the line.  `-k 2` is the standard
    # spelling of the obsolescent zero-based `+1` key, which a conforming
    # sort may instead treat as a file name.  Options are placed before the
    # operand so the command does not rely on option permutation.
    sort -k 2 -o gloss.orig.txt gloss.orig.txt
    
    # Turn gloss.orig.txt into dictionary rows, piped through mc2ud.nawk.
    # Only records with 4 or 5 fields are used:
    #   column 1  <- $1
    #   column 5  <- the next-to-last field
    #   column 8  <- $2 (4 fields) or "$2/$3" (5 fields)
    #   column 11 <- the last field
    # A record whose last field contains `*` or `?` goes to dict.yasuoka.csv
    # with column 11 replaced by `*`; every other record goes to
    # dict.gloss.csv with the last field kept.
    nawk '
    NF==4||NF==5{
      last=$NF;
      tag=$(NF-1);
      mid=(NF==4)?$2:($2 "/" $3);
      if(last~/[*?]/)
        printf("%s,0,0,0,%s,*,*,%s,*,*,*\n",$1,tag,mid)|"mc2ud.nawk > dict.yasuoka.csv";
      else
        printf("%s,0,0,0,%s,*,*,%s,*,*,%s\n",$1,tag,mid,last)|"mc2ud.nawk > dict.gloss.csv";
    }' gloss.orig.txt
    
    # Build one corpus.*.mc file per treebank directory under ../../ud-kanbun.
    #
    # For each target the subshell changes into the source directory and
    # writes its *.txt files (CoNLL-U-style: `# ...` comment lines, numbered
    # token lines, column 10 optionally carrying `Gloss=...`) to stdout.  The
    # subshell keeps every cd local.  Each cd is checked, so a missing
    # directory yields an empty corpus instead of reading `*/*.txt` relative
    # to the wrong directory.
    #
    # The Pulleyblank files get extra punctuation first: every `# text = `
    # line and every sentence receives a trailing "、", except the last
    # sentence of each file, which receives "。".  The number of sentences per
    # file is counted with `grep -c` and handed to awk with -v.
    #
    # The second awk turns token lines into "FORM<TAB>features" rows and emits
    # an EOS line between sentences.  Blank lines never match /^[^#]/, so
    # sentence boundaries are detected solely from the token ID not
    # increasing, plus one final EOS at end of input.
    for F in corpus.pulleyblank.mc corpus.mencius.mc corpus.lunyu.mc corpus.liji.mc corpus.18shilue.mc corpus.chuci.mc corpus.center.mc
    do ( case "$F" in
         corpus.mencius.mc) cd ../../ud-kanbun/kanripo/kR1h0001 &&
            cat */*.txt ;;
         corpus.lunyu.mc) cd ../../ud-kanbun/kanripo/kR1h0004 &&
            cat */*.txt ;;
         corpus.liji.mc) cd ../../ud-kanbun/kanripo/kR1d0052 &&
            cat */*.txt ;;
         corpus.18shilue.mc) cd ../../ud-kanbun/18shilue &&
            cat */*.txt ;;
         corpus.chuci.mc) cd ../../ud-kanbun/kanripo/kR4a0001 &&
            cat 00[12]/*.txt ;;
         corpus.center.mc) cd ../../ud-kanbun/center-exam &&
            cat */*.txt ;;
         corpus.pulleyblank.mc) cd ../../ud-kanbun/Pulleyblank || exit 1
            for f in Pulleyblank*.txt
            do nawk -v n="$(grep -c '^# text = ' "$f")" '
    # n counts the sentences still to come in this file, current one included.
    {
      if($0~/^# text = /){
        if(n<2)
          printf("%s。\n",$0);
        else
          printf("%s、\n",$0);
        n--;
        root=0;
      }
      else if($0==""){
        # End of sentence: append one punctuation token attached to the root.
        id++;
        if(n<1)
          printf("%d\t。\t。\tPUNCT\ts,記号,句点,*\t_\t%d\tpunct\t_\tSpaceAfter=No\n\n",id,root);
        else
          printf("%d\t、\t、\tPUNCT\ts,記号,読点,*\t_\t%d\tpunct\t_\tSpaceAfter=No\n\n",id,root);
        root=0;
      }
      else{
        printf("%s\n",$0);
        id=$1;
        if($8=="root")
          root=id;
      }
    }' "$f"
            done ;;
         esac
       ) | nawk '
    /^[^#]/{
      # A token ID that does not increase starts a new sentence.
      if(n>=$1)
        printf("EOS\n");
      n=$1;
      if(match($10,/Gloss=[^|]+/)>0)
        g=substr($10,RSTART+6,RLENGTH-6);
      else
        g="*";
      printf("%s\t%s,*,*,%s,*,*,%s\n",$2,$5,$3,g);
    }
    END{
      if(n>0)
        printf("EOS\n");
    }' | mc2ud.nawk > "$F"
    done
    
    
    # Re-shape the three seed corpora from ../seed_names2 into the same
    # "FORM<TAB>features" layout as the other corpus.*.mc files: EOS lines
    # pass through, and for every other line the comma-separated second field
    # is reduced to its items 1-4 and 7, padded with `*`.  The result is
    # piped through mc2ud.nawk into a file of the same name in the current
    # directory.
    for F in corpus.KT0.mc corpus.kanjikai.mc corpus.misc.mc
    do nawk '
    {
      if($1=="EOS")
        printf("EOS\n");
      else{
        split($2,a,",");
        printf("%s\t%s,%s,%s,%s,*,*,%s,*,*,*\n",$1,a[1],a[2],a[3],a[4],a[7]);
      }
    }' "../seed_names2/$F" | mc2ud.nawk > "$F"
    done
    
    # dict.KTp2.csv: entries seen in the corpora or in the KTp2 / girei seed
    # lists whose first column is exactly 3 units long according to awk
    # length() and is not already a first column of dict.gloss.csv or
    # dict.yasuoka.csv.
    # NOTE(review): with the C locale set at the top of the script length()
    # presumably counts bytes, i.e. one 3-byte UTF-8 character -- confirm.
    #
    # Corpus rows become dictionary rows by dropping EOS lines and replacing
    # the first tab (between surface form and features) with ",0,0,0,".  The
    # tab is built with printf so it stays visible and survives editors that
    # rewrite whitespace.  Seed rows get their last three columns blanked to
    # `*`.  Commas are swapped for spaces around the awk step so that awk
    # default field splitting sees the columns.
    tab=$(printf '\t')
    ( sed -e /EOS/d -e "s/${tab}/,0,0,0,/" corpus.pulleyblank.mc corpus.mencius.mc corpus.lunyu.mc corpus.liji.mc corpus.18shilue.mc corpus.chuci.mc corpus.center.mc corpus.KT0.mc corpus.kanjikai.mc corpus.misc.mc
      sed 's/,[^,]*,[^,]*,[^,]*$/,*,*,*/' ../seed_names2/KTp2.*.csv ../seed_names2/name.Noun.girei.csv
    ) | sort -u | tr , ' ' | nawk '
    BEGIN{
      # 39 is the single-quote character: the command run is  tr , <quote><space><quote>
      c=sprintf("cat dict.gloss.csv dict.yasuoka.csv | tr , %c %c",39,39);
      while((c|getline)>0)
        w[$1]=1;
      close(c);
    }
    length($1)==3{
      if(w[$1]!=1)
        printf("%s\n",$0);
    }' | tr ' ' , | simplify.sh | mc2ud.nawk | sort -u > dict.KTp2.csv
    
    # dict.name.csv: proper-name entries from the personal-name, surname and
    # place-name seed lists.
    #
    # sed blanks the last three columns of every seed row to `*`, so column 14
    # arrives as `*` unless the awk program assigns a name tag.  A row is
    # written only when
    #   - it received a tag (s is not `*`),
    #   - column 11 is not `*`,
    #   - neither column 1 nor column 11 contains the byte pair 240,175, and
    #   - its (1,5,6,7,8) column tuple is not already in dict.gloss.csv or
    #     dict.yasuoka.csv (read with tabs turned into commas).
    # Place names are skipped when they are longer than 7 (per awk length())
    # and end in 府, 縣 or 郡, or are at most 7 long and end in 海.
    # NOTE(review): bytes 240,175 are presumably the UTF-8 lead bytes of the
    # U+2Fxxx range; with LANG=C length() presumably counts bytes -- confirm.
    # NOTE(review): the two alternatives in the last regular expression render
    # alike; presumably one is a compatibility-ideograph form -- make sure the
    # code points survive editing.
    # NOTE(review): this is the only step that calls `awk` rather than `nawk`;
    # left as found.
    sed 's/,[^,]*,[^,]*,[^,]*$/,*,*,*/' ../seed_names2/name.Noun.personal.csv ../seed_names2/name.Noun.surname.csv ../seed_names2/KTp2.Noun.place.?.csv | sort -u | awk -F, '
    BEGIN{
      # 39 is the single-quote character: the command run is  tr <quote>\011<quote> ,
      c=sprintf("cat dict.gloss.csv dict.yasuoka.csv | tr %c\\011%c ,",39,39);
      while((c|getline)>0)
        w[$1,$5,$6,$7,$8]=1;
      close(c);
      m=sprintf("%c%c",240,175);
    }
    {
      s=$14;
      if($5=="n"&&$6=="名詞"){
        if($7=="人"&&$8=="姓氏")
          s="[surname]";
        else if($7=="人"&&$8=="名")
          s="[given-name]";
        else if($7=="主体"&&$8=="書物")
          s="[book-name]";
        else if($7=="主体"&&$8=="国名")
          s="[country-name]";
        else if($7=="固定物"&&$8=="地名"){
          if(length($1)>7){
            if($1~/(府|縣|郡)$/)
              next;
          }
          else if($1~/(海|海)$/)
            next;
          s="[place-name]";
        }
      }
      if(s!="*"&&$11!="*"&&index($1,m)<1&&index($11,m)<1&&w[$1,$5,$6,$7,$8]!=1)
        printf("%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n",$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,s);
    }' | simplify.sh | mc2ud.nawk | sort -u > dict.name.csv
    
    # dict.symbol.csv: the symbol seed list with its last three columns
    # blanked to `*`, de-duplicated and piped through mc2ud.nawk.
    sed 's/,[^,]*,[^,]*,[^,]*$/,*,*,*/' ../seed_names2/Symbol.csv |
      sort -u |
      mc2ud.nawk > dict.symbol.csv
    
    # Do not leave an empty dict.yasuoka.csv behind: keep the file only when
    # it exists with a size greater than zero.
    [ -s dict.yasuoka.csv ] || /bin/rm -f dict.yasuoka.csv