Sunday, March 29, 2009

Spelled or Spoken Number to Decimal Conversion

Have you ever wanted to parse a text that reads “one hundred thousand and twenty five point seven” to “100025.7”, then the following code may be for you. I had been looking around the internet for some ready-made code that’ll do this to no avail, so this might help others who’re in the same situation I was in.

To use the class, give it your namespace. Instantiate and call TranslateSentenceToString(“String with text to be parsed”). Comprehensive explanation on the code logic is given in the inline documentation.

This code also requires the string replace with ignore case, included in the section after the parser class. To start parsing






using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using Utils;

namespace YourNameSpace
{
public class TextToNumericParser
{
public static readonly int TOKEN_INDEX_DEFAULT = -1;
public static readonly int MAX_SEARCH_COUNTER = 1000; //If the parsing is falling into a cycle, this is an insurance, to force the class to quit.

private int _SearchCounter = 0;

public bool SearchMixture = false;

Dictionary<string, long> Specials = new Dictionary<string,long>();
Dictionary<string, long> Elements = new Dictionary<string, long>();
Dictionary<string, long> Exponents = new Dictionary<string, long>();
List<string> WhiteSpaces = new List<string>();

//We keep the LeadingZeroCount as a global variable. This works because this recursion is a straight line. It doesn't branch, but a more suitable solution may be needed.
int LeadingZeroCount = 0;

//List<string> Punctuations = new List<string>(); Not needed, we won't ignore punctuation. Eg. one hundred. two = 100 . 2

private static readonly char Seperator = '\a';

private int CurrentTokenIndex = -1;
private string[] TokensToBeParsed;

private void TickSearchCounter()
{
_SearchCounter++;
if(_SearchCounter>=MAX_SEARCH_COUNTER)
throw new Exception("Search taking too long, arborting");
}

/// <summary>
/// Retrieves the next token to be parsed, or null if it reaches the last token.
/// </summary>
public string NextToken
{
get
{
CurrentTokenIndex++;
//Increment the search counter, and make sure it hasn't hit the ceiling yet.
TickSearchCounter();
try
{
return TokensToBeParsed[CurrentTokenIndex];
}
catch //End of the tokens.
{
return null;
}
}
}

/// <summary>
/// Split the text to be parsed into tokens - to make it easier.
/// </summary>
public string TextToBeParsed
{
//get
//{
// return _TextToBeParsed;
//// On 2nd thought we don't want the user to see the text to be parsed, because it'll be marked with seperator.
//}
set
{
string _TextToBeParsed = "";
_TextToBeParsed = value;
foreach(string special in Specials.Keys)
{
_TextToBeParsed = _TextToBeParsed.Replace(special + " ", Seperator + special + Seperator + " ", true);
}
foreach(string element in Elements.Keys)
{
_TextToBeParsed = _TextToBeParsed.Replace(element + " ", Seperator + element + Seperator + " ", true);
}
foreach (string exponent in Exponents.Keys)
{
_TextToBeParsed = _TextToBeParsed.Replace(exponent + " ", Seperator + exponent + Seperator + " ", true);
}

TokensToBeParsed = _TextToBeParsed.Split(new char[] { '\a' });
}
}

/// <summary>
/// Adds all the discrete tokens that may be parsed.
/// </summary>
public TextToNumericParser()
{
//Instantiate the special character .eg (a thousand dollar) can be translated to 1,000, however it's special because it's only numeric if followed by a number eg. a plane is not numeric.
Specials.Add("a",1);
//Specials.Add("and", 0); //this is not a numeric but may be used as a conjunction **between two numbers** to imply an addition.
Specials.Add("point", 0); //this is not a numeric but may be used to denote a floating point.

//Elements are the most basic numeric.
Elements.Add("zero", 0);
Elements.Add("one", 1);
Elements.Add("two", 2);
Elements.Add("three",3);
Elements.Add("four",4);
Elements.Add("five",5);
Elements.Add("six",6);
Elements.Add("seven",7);
Elements.Add("eight",8);
Elements.Add("nine",9);
Elements.Add("ten",10);

//If allowed search mixture, this parser will attempt to parse an already numeric number, so that for example two hundred thousand . 5 will be parsed to 200,000.5
if (SearchMixture)
{
Elements.Add("0", 0);
Elements.Add("1", 1);
Elements.Add("2", 2);
Elements.Add("3", 3);
Elements.Add("4", 4);
Elements.Add("5", 5);
Elements.Add("6", 6);
Elements.Add("7", 7);
Elements.Add("8", 8);
Elements.Add("9", 9);
}
Elements.Add("eleven",11);
Elements.Add("twelve",12);
Elements.Add("thirteen",13);
Elements.Add("fourteen", 14);
Elements.Add("fifteen", 15);
Elements.Add("sixteen", 16);
Elements.Add("seventeen", 17);
Elements.Add("eightteen", 18);
Elements.Add("nineteen", 19);
Elements.Add("twenty", 20);
Elements.Add("thirty", 30);
Elements.Add("fourty", 40);
Elements.Add("fifty", 50);
Elements.Add("sixty", 60);
Elements.Add("seventy", 70);
Elements.Add("eighty", 80);
Elements.Add("ninety", 90);

//These are the exponent order.
Exponents.Add("hundred", 100);
Exponents.Add("thousand", 1000);
Exponents.Add("million", (long) Math.Pow(10,6));
Exponents.Add("billion", (long)Math.Pow(10, 9));
Exponents.Add("trillion", (long) Math.Pow(10, 12));

//White spaces. The existence of white spaces should be ignored when parsing a number.
WhiteSpaces.Add("");
WhiteSpaces.Add("\n");
WhiteSpaces.Add("\t");
WhiteSpaces.Add("\r");
WhiteSpaces.Add("\v");

}

/// <summary>
/// Parse all numbers and concat all the tokens result into a string.
/// </summary>
/// <param name="TextToParse">The text that needs parsing.</param>
/// <returns>A string translation with all the spelled numeric turned into numbers.</returns>
public string TranslateToString(string TextToParse)
{
List<string> ParsedText = TranslateSentence(TextToParse);
string TranslatedText = "";
foreach (string Text in ParsedText)
{
TranslatedText += Text;
}
return TranslatedText;
}

/// <summary>
/// Parse spelled out numbers in the sentence.
/// </summary>
/// <param name="TextToParse">The text that needs parsing.</param>
/// <returns>All the tokens that resulted from the parsing.</returns>
public List<string> TranslateSentence(string TextToParse)
{
TextToBeParsed = TextToParse;
return TranslateSentence();
}

/// <summary>
/// You must set the text to be parsed first.
/// NOTE: TODO. this will not detect one hundred - thousand ...
/// </summary>
/// <returns>The tokenized translation</returns>
private List<string> TranslateSentence()
{
List<string> Translation = new List<string>();
//Reset the current index.
CurrentTokenIndex = -1;

for(string Token = NextToken; CurrentTokenIndex < TokensToBeParsed.Length; Token = NextToken)
//while (CurrentTokenIndex < TokensToBeParsed.Length)
{
if (MayBeNumeric(Token))
Translation.Add(TranslateMayBeDecimal(Token));
//Definitely could not be numeric.
else
Translation.Add(Token);
}
//Reset the current index.
CurrentTokenIndex = -1;

return Translation;
}

/// <summary>
/// Translate a decimal (made of 2 integers, seperated by a "point")
/// </summary>
/// <param name="Token"></param>
/// <returns></returns>
private string TranslateMayBeDecimal(string Token)
{
//return null;
string ReturnToken = Token;

long? NumericString = GetInteger(Token);

double? FractionString = GetFraction(NextToken);

if (NumericString != null)
if (FractionString != null)
ReturnToken = (NumericString + FractionString).ToString();
else
ReturnToken = NumericString.ToString();
if (FractionString != null)
ReturnToken = FractionString.ToString();

//Since this will always return a value regardless of whether a numeric conversion took place or not and GetNumeric successfully consoume the token,
//This method ALWAYS consumes a token, but will return a string if it cannot translate to some decimal.
if(NumericString == null)
CurrentTokenIndex++;
return ReturnToken;
}

/// <summary>
/// Gets the integer portion of a decimal number.
/// </summary>
/// <param name="Token">The token that needs to be translated into an integer.</param>
/// <returns>The integer translation of the so said number.</returns>
private long? GetInteger(string Token)
{

long? exp, Temp, nextNumber, thisNumber;

if (Token != null) //If this token is null, ie, if the previous call to "NextToken" went over the number of tokens, then we do nothing.
{

//Continue traversing while it's a white space.
if (Token.Trim().Length == 0)
{
long? Number = GetInteger(NextToken);
//If at the end of the whitespace, we reached a number, then return it.
if (Number != null)
return Number;
}
else if (Token.Trim().ToLower() == "a")
{
exp = GetExponent(NextToken);
//If the a is followed by an "a" is followed by an exponent, then this "a" is equivalent to the number "1" and we treat it as such.
if (exp != 1)
{
nextNumber = GetInteger(NextToken);

//For example a hundred thousand and twelve, we want to add the "100,000" with the "12"
if (nextNumber != null)
return exp + nextNumber;
else
return exp;
}
}
else if (Token.ToLower().Trim() == "and")
{
Temp = GetInteger(NextToken);
//If the and is followed by a number, return that number, otherwise we want to spit the and back out, because we can't process this.
if (Temp != null)
return Temp;
}
else if (Elements.Keys.Contains(Token.ToLower().Trim()))
{
//Get thisNumber whether it's spelled out, or in digit. - See constructor.
thisNumber = Elements[Token.ToLower().Trim()];
//Try to see if the number is followed by an exponent.
exp = GetExponent(NextToken);
//Get the number that follows the exponent (if there's one)
nextNumber = GetInteger(NextToken);

if (exp.Value != 1 || Elements[Token.ToLower().Trim()] >= 10)//If the number is NOT a digit spelled out one by one, or if it has an exponent component, then we want to add them together,
{
if (nextNumber != null)
return thisNumber * exp + nextNumber;
else
return thisNumber * exp;
}
else //Otherwise we want to interprete it as if it're a digit spelled one by one.
{
long? ReturnTemp = 0;
//We multiply this number by the number of leading zero
if (nextNumber != null)
ReturnTemp = long.Parse((thisNumber * Math.Pow(10, LeadingZeroCount)).ToString() + nextNumber.ToString());
else
ReturnTemp = thisNumber;
//We need to increment any leading zero to sorta "borrow" it, otherwise they'll dissapear on the way back from recursion, when we cast the string to long.
if (thisNumber == 0)
LeadingZeroCount++;
//If this number is not zero, then it should be accounted for above.
else
LeadingZeroCount = 0;
//Return the number to the caller.
return ReturnTemp;
}
}

}
//Cannot consume the token, so we need to decrement the index back to the previous state.
CurrentTokenIndex--;

return null;
}

/// <summary>
/// This is used to get the fraction part ie. the part that follows after a "." (dot) or "point"
/// </summary>
/// <param name="Token"></param>
/// <returns></returns>
private double? GetFraction(string Token)
{
if (Token != null)
{
//Continue traversing while it's a white space.
if (Token.Trim().Length == 0)
{
double? Fraction = GetFraction(NextToken);
//If at the end of the whitespace, we reached number;
if (Fraction != null)
return Fraction;
}
else if (Token.Trim().ToLower() == "." || Token.Trim().ToLower() == "point")
{
long? NumberPart = GetInteger(NextToken);
//If the point was followed by a number, then return the fraction, otherwise we need to backtrack;
if (NumberPart != null)
{
return double.Parse("." + NumberPart.ToString()) * Math.Pow(10, -1 * LeadingZeroCount);
}
}
}
CurrentTokenIndex--;
return null;
}

/// <summary>
/// Gets the exponent part. Eg it'll return "100" from "a hundred"
/// </summary>
/// <param name="Token">The exponent part</param>
/// <returns>The exponent value.</returns>
private long? GetExponent(string Token)
{
if(Token!=null)
Token = Token.Trim().ToLower(); //Get rid of white spaces and caps.

//Continue traversing while it's a white space.
if (Token != null && Token.Trim().Length == 0)
{

long? exp = GetExponent(NextToken);
//If at the end of the whitespace, we reached a null or a non-exponent
if (exp.Value == 1)
CurrentTokenIndex--; //We cannot consume.
else
return exp; //Otherwise, we found an exponent and return it.
}
//If an exponent is found.
else if (Token != null && Exponents.Keys.Contains(Token))
return Exponents[Token] * GetExponent(NextToken);
//Cannot consume, neither white space nor an exponent.
else
CurrentTokenIndex--;
return 1;
}

/// <summary>
/// Checks if the given token is possibly a number - a necessary, insufficient check.
/// </summary>
/// <param name="Token">The token to be checked.</param>
/// <returns></returns>
private bool MayBeNumeric(string Token)
{
return Specials.Keys.Contains(Token.Trim().ToLower()) || Elements.Keys.Contains(Token.Trim().ToLower());
}

}
}


You also need to create the following helper class:



using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

namespace Utils
{
public static class StringExtension
{
/// <summary>
/// String replace function that support
/// </summary>
/// <param name="OrigString">Original input string</param>
/// <param name="FindString">The string that is to be replaced</param>
/// <param name="ReplaceWith">The replacement string</param>
/// <param name="Instance">Instance of the FindString that is to be found. if Instance = -1 all are replaced</param>
/// <param name="CaseInsensitive">Case insensitivity flag</param>
/// <returns>updated string or original string if no matches</returns>
public static string Replace(this string OrigString, string FindString,
string ReplaceWith, int Instance,
bool CaseInsensitive)
{
if (Instance == -1)
return OrigString.Replace(FindString, ReplaceWith, CaseInsensitive);

int at1 = 0;
for (int x = 0; x < Instance; x++)
{

if (CaseInsensitive)
at1 = OrigString.IndexOf(FindString, at1, OrigString.Length - at1, StringComparison.OrdinalIgnoreCase);
else
at1 = OrigString.IndexOf(FindString, at1);

if (at1 == -1)
return OrigString;

if (x < Instance - 1)
at1 += FindString.Length;
}

return OrigString.Substring(0, at1) + ReplaceWith + OrigString.Substring(at1 + FindString.Length);
}

/// <summary>
/// Replaces a substring within a string with another substring with optional case sensitivity turned off.
/// </summary>
/// <param name="OrigString">String to do replacements on</param>
/// <param name="FindString">The string to find</param>
/// <param name="ReplaceString">The string to replace found string wiht</param>
/// <param name="CaseInsensitive">If true case insensitive search is performed</param>
/// <returns>updated string or original string if no matches</returns>
public static string Replace(this string OrigString, string FindString,
string ReplaceString, bool CaseInsensitive)
{
int at1 = 0;
while (true)
{
if (CaseInsensitive)
at1 = OrigString.IndexOf(FindString, at1, OrigString.Length - at1, StringComparison.OrdinalIgnoreCase);
else
at1 = OrigString.IndexOf(FindString, at1);

if (at1 == -1)
return OrigString;

OrigString = OrigString.Substring(0, at1) + ReplaceString + OrigString.Substring(at1 + FindString.Length);

at1 += ReplaceString.Length;
}

return OrigString;
}
}
}