#!/bin/sh

# Force the C locale so the text tools used below (tr, grep, sed, awk, sort)
# process bytes and sort reproducibly.  LC_ALL is exported as well because it
# takes precedence over LANG: with LANG alone, an LC_ALL inherited from the
# caller's environment would silently override the setting.
LANG=C; export LANG
LC_ALL=C; export LC_ALL

# Build the full transcription file from every PROMPTS file found under wav/.
#
# Each input line is "<utterance-id> WORD WORD ..."; each output line is
# "<s> WORD WORD ... </s> (<utterance-id>)".
#
# Pipeline stages:
#   - rewrite "mfc" to "wav" everywhere in the line
#   - delete carriage returns and the characters + ( )
#   - drop lines containing the "§" character or the string BAD_
#   - strip the MFCC directory prefix (the dots in that pattern are regex
#     wildcards, exactly as in the original expression)
#   - wrap the words in <s> ... </s> and append the id in parentheses
#
# The awk stage always prints through an explicit "%s" format: prompt text is
# data, and a "%" inside a word or id must not be interpreted as a printf
# conversion.  Blank lines are skipped so they cannot yield an entry with an
# empty utterance id.
find wav -name PROMPTS -exec cat {} + |
  sed 's/mfc/wav/g' |
  tr -d '\r' |
  tr -d "+()" |
  grep -v "§" |
  grep -v BAD_ |
  sed 's:../../../Audio/MFCC/XXkHz_YYbit/MFCC_0_D/::g' |
  awk 'NF {
         printf "<s> "
         for (i = 2; i <= NF; i++)
           printf "%s ", $i
         printf "</s> (%s)\n", $1
       }' > voxforge_en_sphinx_full.transcription

# Derive the file-id list from the full transcription: take the last
# whitespace-separated field of every line (the parenthesised id) and strip
# its parentheses.
awk '{
       id = $NF
       gsub(/[()]/, "", id)   # delete every "(" and ")"
       sub(/ $/, "", id)      # drop a single trailing blank, if any
       print id
     }' voxforge_en_sphinx_full.transcription > voxforge_en_sphinx_full.fileids

# Test-set transcriptions: keep every line of the full transcription whose
# 1-based line number ends in the digit 9 (9, 19, 29, ...).
awk 'NR % 10 == 9 {
       $1 = $1   # rebuild the record: fields joined by single blanks
       print
     }' voxforge_en_sphinx_full.transcription > voxforge_en_sphinx_test.transcription

# Training-set transcriptions: keep every line of the full transcription whose
# 1-based line number does NOT end in the digit 9.
awk 'NR % 10 != 9 {
       $1 = $1   # rebuild the record: fields joined by single blanks
       print
     }' voxforge_en_sphinx_full.transcription > voxforge_en_sphinx_train.transcription

# Test-set file ids: keep every line of the full file-id list whose 1-based
# line number ends in the digit 9, i.e. the same line numbers that are selected
# for the test transcriptions.
awk 'NR % 10 == 9 {
       $1 = $1   # rebuild the record: fields joined by single blanks
       print
     }' voxforge_en_sphinx_full.fileids > voxforge_en_sphinx_test.fileids

# Training-set file ids: keep every line of the full file-id list whose 1-based
# line number does NOT end in the digit 9.
awk 'NR % 10 != 9 {
       $1 = $1   # rebuild the record: fields joined by single blanks
       print
     }' voxforge_en_sphinx_full.fileids > voxforge_en_sphinx_train.fileids

# Vocabulary: every distinct token of the full transcription, one per line,
# sorted.  The last field of each line (the parenthesised id) is left out;
# the <s> and </s> markers are ordinary fields and therefore are included.
awk '{
       last_word = NF - 1
       k = 0
       while (++k <= last_word)
         print $k
     }' voxforge_en_sphinx_full.transcription |
  sort |
  uniq > voxforge_en_sphinx.vocab


