Voice Recognition with Arduino

Recently I built a voice recognition circuit using Arduino. Voice recognition is not that easy with Arduino, It requires more processing and voice analyzing power. Keep in mind that all you have to deal with is a ATMega328P microcontroller running on 16MHz. That's why some modules like BitVoicer is outsourcing the processing power to the PC(Serial/UART or TCP/IP). But modules like uSpeech are completely independent of the PC and can run on its own. Thus, I used uSpeech.

uSpeech is a limited voice recognition software for the AVR. So, you can not directly write the text and expect it to recognize the speech. You need to deal with Phonemes. Hence,a fair amount of patient is needed. List of phonemes you would get is as below;

Phoneme      |   Literal       
--------------------------------------------------------------------
e            | The e sound.
--------------------------------------------------------------------   
h            | The `/sh/` sound. It can also be raised by 
             | `ch`, `j` and `z`
--------------------------------------------------------------------
v            | The `v` sound, occasionally triggered by  , 'z'
             | (may need to be fixed)
--------------------------------------------------------------------
f            | The `f` sound. 
--------------------------------------------------------------------
s            | The 's' sound.
--------------------------------------------------------------------
o            | 'a','o','i','r','l','m','n' and 'u' sounds. 
--------------------------------------------------------------------
' '          | Too Quiet for anything
--------------------------------------------------------------------

In order to get voice input you need a condenser mic with an operational amplifier (op-amp). The easiest way is to buy a MIC module which all are integrated. Try an ebay search.

You can wire it as below. 220 ohm resistors are used.

To Zoom in: click on the image

This module has LM393 Op-amp chip and Electret condenser microphone.

This is what i bought from the ebay. Just 1.88 USD.

In uSpeech examples, what they actually compared is a single phoneme.It recognizes 6 phonemes (f, e, o, v, s, h). So it is handy for simple commands (if they have one of those phonemes), but it isn’t generalized at all. What i did is, I used a char[32] `buffer` to hold phoneme patterns. Then receiving `buffer` is compared with a list of predefined phoneme patterns. For instance my predefined patterns for green,orange and white are accordingly; "vvvoeeeeeeeofff", "hhhhhvoovvvvf", "hooooooffffffff". To compare which is the closest i used, minimum edit distance(Levenshtein distance). But how to ignore irrelevant patterns? you can use a threshold.

#include <uspeech.h>
#define ledGreen 7
#define ledOrange 6
#define ledWhite 5
#define MIN3(a, b, c) ((a) < (b) ? ((a) < (c) ? (a) : (c)) : ((b) < (c) ? (b) : (c)))
signal voice(A0);
const int BUFFER_MAX_PHONEMES=32;
char inputString[BUFFER_MAX_PHONEMES]; // Allocate some space for the string
byte index = 0; // Index into array; where to store the character
//
const int DICT_MAX_ELEMNTS=3;
char dict[DICT_MAX_ELEMNTS][BUFFER_MAX_PHONEMES]={"vvvoeeeeeeeofff","hhhhhvoovvvvf","hooooooffffffff"};
int LOWEST_COST_MAX_THREASHOLD=20;
void setup(){
  /*
  Phoneme      |   Literal       
  --------------------------------------------------------------------
  e            | The e sound.
  --------------------------------------------------------------------   
  h            | The `/sh/` sound. It can also be raised by 
               | `ch`, `j` and `z`
  --------------------------------------------------------------------
  v            | The `v` sound, occasionally triggered by  , 'z'
               | (may need to be fixed)
  --------------------------------------------------------------------
  f            | The `f` sound. 
  --------------------------------------------------------------------
  s            | The 's' sound.
  --------------------------------------------------------------------
  o            | 'a','o','i','r','l','m','n' and 'u' sounds. 
  --------------------------------------------------------------------
  ' '          | Too Quiet for anything
  --------------------------------------------------------------------
  */
  voice.f_enabled = true;
  voice.minVolume = 1500;
  voice.fconstant = 500;
  voice.econstant = 2;
  voice.aconstant = 4;
  voice.vconstant = 6;
  voice.shconstant = 10;
  voice.calibrate();
  Serial.begin(9600);
  pinMode(ledGreen, OUTPUT); 
  pinMode(ledOrange, OUTPUT); 
  pinMode(ledWhite, OUTPUT);
}

void loop(){
    voice.sample();
    char p = voice.getPhoneme();
    if(p==' ' || index >= BUFFER_MAX_PHONEMES){
      if(strLength(inputString)>0){
         Serial.println("received:"+String(inputString));
         parseCommand(inputString);
         inputString[0]=0;//clear string char array
         index=0;
      }
    }else{
      //printArray(voice.arr);
      inputString[index] = p; // Store it
      index++;
      inputString[index] = '\0'; // Null terminate the string
    }
}

char* guessWord(char* target){
  int len = strlen(target);//held target length
  
  for(int i=0;i<DICT_MAX_ELEMNTS;i++){
    //simple validation
    if(dict[i]==target){
      return dict[i];
    }
  }
  
  unsigned int cost[DICT_MAX_ELEMNTS];//held minimum distance cost

  //calculating each words cost and hits
  for(int j=0;j<DICT_MAX_ELEMNTS;j++){//loop through the dictionary
      cost[j]=levenshtein(dict[j],target);
      Serial.println("dict[j]="+String(dict[j])+" target="+String(target)+" cost="+String(cost[j]));
  }    
  
  //Determining lowest cost but still all letters in the pattern hitting word
  int lowestCostIndex=-1;
  int lowestCost=LOWEST_COST_MAX_THREASHOLD;
  for(int j=0;j<DICT_MAX_ELEMNTS;j++){//loop through the dictionary
    //Serial.println("dict[j]="+dict[j]+" dict[j].length()="+String(strlen(dict[j]))+" cost="+String(cost[j])+" hits="+String(hits[j])+" j="+String(j));
    if(cost[j]<lowestCost){
      lowestCost = cost[j];
      lowestCostIndex=j;
    }
  }
  
  //Serial.println("lowestCostIndex="+String(lowestCostIndex)+" lowestCost="+String(lowestCost));
  
  if(lowestCostIndex>-1){
    //Serial.println("lowestCost="+String(lowestCost)+" lowestCostIndex="+String(lowestCostIndex));
    return dict[lowestCostIndex];
  }else{
    return "";
  }
}

void parseCommand(char* str){
  char *gWord = guessWord(str);
  Serial.println("guessed :"+String(gWord));
  if(gWord==""){
    return;
  }else if(gWord==dict[0]){
    digitalWrite(ledGreen, HIGH);
    digitalWrite(ledOrange, LOW);
    digitalWrite(ledWhite, LOW);
  }else if(gWord==dict[1]){
    digitalWrite(ledGreen, LOW);
    digitalWrite(ledOrange, HIGH);
    digitalWrite(ledWhite, LOW);
  }else if(gWord==dict[2]){
    digitalWrite(ledGreen, LOW);
    digitalWrite(ledOrange, LOW);
    digitalWrite(ledWhite, HIGH);
  }
}

unsigned int levenshtein(char *s1, char *s2) {
    unsigned int s1len, s2len, x, y, lastdiag, olddiag;
    s1len = strlen(s1);
    s2len = strlen(s2);
    unsigned int column[s1len+1];
    for (y = 1; y <= s1len; y++)
        column[y] = y;
    for (x = 1; x <= s2len; x++) {
        column[0] = x;
        for (y = 1, lastdiag = x-1; y <= s1len; y++) {
            olddiag = column[y];
            column[y] = MIN3(column[y] + 1, column[y-1] + 1, lastdiag + (s1[y-1] == s2[x-1] ? 0 : 1));
            lastdiag = olddiag;
        }
    }
    return(column[s1len]);
}

int strLength(char const* s) {
    int i = 0;
    while (s[i] != '\0' && s[i] != ' ')
        ++i;

    return i;
}

How to install uSpeech? download the zip file from here. Then, extract the files in to your arduino `Library` folder. In my case, it is on 'C:\Program Files (x86)\Arduino\libraries'. It should be in a separate folder, let's say 'uSpeech'. Finally, restart the Arduino IDE.

Tiriboy's Blog

Voice Recognition with Arduino

Rasika Perera

Voice Recognition with Arduino

Rasika Perera

RELATED POSTS