Implementación del algoritmo (función) Soundex en diferentes lenguajes de programación

Implementación del algoritmo (función) Soundex en diferentes lenguajes de programación

La función Soundex es un algoritmo fonético para indexar nombres según el sonido de su pronunciación en inglés. Los códigos Soundex de diferentes cadenas de texto pueden compararse para ver qué tan similares son dos cadenas cuando se pronuncian.

El primer carácter del código Soundex es el primer carácter de la expresión dada, convertido a mayúscula. Los siguientes caracteres del código son números que representan las letras de la expresión. Las letras A, E, I, O, U, H, W e Y son ignoradas a no ser que sean la primera letra de la cadena de texto. Todos los caracteres internacionales fuera del rango A-Z son tratados como vocales. Por lo tanto, dos cadenas que suenan casi igual deberían tener el mismo índice Soundex. Por ejemplo, las palabras "texto" y "tixto" deberían producir un código Soundex de T230.

En este artículo encontrarás la implementación del conocido algoritmo en los siguientes lenguajes de programación:

Comencemos !

C

#include <stdio.h>

/*
 * Soundex class of every 7-bit ASCII character, filled in by init():
 *   1..6  coded consonant classes
 *   -1    vowels (A, E, I, O, U), which separate equal consonants
 *    0    everything else (H, W, Y, digits, punctuation), which is ignored
 * The element type is explicitly signed: plain char is unsigned on some
 * platforms, where the -1 vowel marker would never compare equal to -1.
 */
static signed char code[128] = { 0 };

/*
 * Returns the Soundex class of one character. Bytes outside 7-bit ASCII
 * (for example the bytes of a UTF-8 sequence) are reported as 0 so they are
 * ignored instead of indexing outside the table.
 */
static int code_of(char ch)
{
	unsigned char u = (unsigned char)ch;

	return u < sizeof code ? code[u] : 0;
}

/*
 * Computes the 4-character Soundex code of s.
 *
 * init() must have been called once beforehand. The first character of s is
 * copied as-is (it is not upper-cased). A NULL or empty s yields an empty
 * string. The result lives in a static buffer that is overwritten by the
 * next call, so the function is not re-entrant.
 */
const char* soundex(const char *s)
{
	static char out[5];
	int c, prev, i;

	out[0] = out[4] = 0;
	if (!s || !*s) return out;

	out[0] = *s++;

	/* The first letter is not coded, but it still suppresses a following
	 * letter of the same class (Pfister -> P236). */
	prev = code_of(out[0]);
	for (i = 1; *s && i < 4; s++) {
		c = code_of(*s);
		if (c == prev) continue;

		if (c == -1) prev = 0;	/* vowel as separator */
		else if (c > 0) {
			out[i++] = (char)(c + '0');
			prev = c;
		}
		/* c == 0 (H, W, Y, non-letters): skipped without touching prev */
	}
	while (i < 4) out[i++] = '0';
	return out;
}

/*
 * Assigns class c to every letter in s, for both its upper- and lower-case
 * form (the two differ only in bit 0x20 in ASCII).
 */
void add_code(const char *s, int c)
{
	for (; *s; s++) {
		unsigned char u = (unsigned char)*s;

		if (u < sizeof code)
			code[u] = code[u ^ 0x20] = (signed char)c;
	}
}

/*
 * Populates the class table. The position of each string in cls, minus one,
 * is the class of its letters: "AEIOU" -> -1, "" -> 0 (placeholder),
 * "BFPV" -> 1, and so on up to "R" -> 6.
 */
void init()
{
	static const char *cls[] =
		{ "AEIOU", "", "BFPV", "CGJKQSXZ", "DT", "L", "MN", "R", 0};
	int i;

	for (i = 0; cls[i]; i++)
		add_code(cls[i], i - 1);
}

Uso

/* Demo: builds the lookup table, then prints the code for one word. */
int main()
{
    init();
    /* J126 */
    /* Print through "%s": the result is data, never a format string. */
    printf("%s\n", soundex("Javascript"));

    return 0;
}

C#

using System.Text.RegularExpressions;

/// <summary>
/// American Soundex encoder: maps a word to a letter followed by three digits
/// so that similar-sounding English words share the same code.
/// </summary>
public static class Soundex
{
    private const int MaxSoundexCodeLength = 4;

    /// <summary>
    /// Computes the Soundex code of <paramref name="word"/>.
    /// </summary>
    /// <param name="word">Text to encode; punctuation is stripped first.</param>
    /// <returns>
    /// A four-character code, or "0000" when <paramref name="word"/> is null
    /// or empty after punctuation has been removed.
    /// </returns>
    public static string For(string word)
    {
        // Invariant casing keeps the result independent of the current culture
        // (e.g. the Turkish dotted/dotless i).
        word = Regex.Replace(
            word == null ? string.Empty : word.ToUpperInvariant(),
            @"[^\w\s]",
            string.Empty);

        if (word.Length == 0)
        {
            return new string('0', MaxSoundexCodeLength);
        }

        // Fully qualified so the class compiles with only the
        // System.Text.RegularExpressions using directive.
        var soundexCode = new System.Text.StringBuilder(MaxSoundexCodeLength);
        soundexCode.Append(word[0]);

        // The first letter is kept verbatim, but its digit still suppresses a
        // following letter of the same class (Pfister -> P236).
        var previousCode = GetCharNumberForLetter(word[0]);

        for (var i = 1; i < word.Length && soundexCode.Length < MaxSoundexCodeLength; i++)
        {
            var letter = word[i];

            // H and W are transparent: equal codes on either side of them
            // count as one (Ashcraft -> A261), so previousCode is left alone.
            if (letter == 'H' || letter == 'W')
            {
                continue;
            }

            var currentCode = GetCharNumberForLetter(letter);

            if (currentCode != '0' && currentCode != previousCode)
            {
                soundexCode.Append(currentCode);
            }

            // Vowels and other uncoded characters reset previousCode to '0',
            // which lets the same consonant class be coded again after them.
            previousCode = currentCode;
        }

        return soundexCode.ToString().PadRight(MaxSoundexCodeLength, '0');
    }

    /// <summary>
    /// Returns the Soundex digit for an upper-case letter, or '0' for any
    /// character that carries no code.
    /// </summary>
    private static char GetCharNumberForLetter(char letter)
    {
        switch (letter)
        {
            case 'B': case 'F': case 'P': case 'V':
                return '1';
            case 'C': case 'G': case 'J': case 'K':
            case 'Q': case 'S': case 'X': case 'Z':
                return '2';
            case 'D': case 'T':
                return '3';
            case 'L':
                return '4';
            case 'M': case 'N':
                return '5';
            case 'R':
                return '6';
            default:
                return '0';
        }
    }
}

Uso

Soundex.For("CSharp Language") == Soundex.For("CSherp Language"); // true: both strings encode to C614

D

The D standard library (Phobos) already contains a soundex function.

import std.stdio: writeln;
import std.string: soundex;
 
void main() {
    // Word / expected-code pairs, checked in the order listed.
    static immutable string[2][] cases = [
        ["soundex",   "S532"],
        ["example",   "E251"],
        ["ciondecks", "C532"],
        ["ekzampul",  "E251"],
        ["Robert",    "R163"],
        ["Rupert",    "R163"],
        ["Rubin",     "R150"],
        ["Ashcraft",  "A261"],
        ["Ashcroft",  "A261"],
        ["Tymczak",   "T522"],
    ];

    foreach (pair; cases)
        assert(soundex(pair[0]) == pair[1]);
}

F#

/// Computes the American Soundex code (one letter plus three digits) of x.
/// Non-letter characters are dropped before encoding. Returns "0000" when x
/// is null, empty, or contains no letters.
let americanSoundex (x : string) =
    // Soundex digit of an upper-case letter; '0' for letters that carry no code.
    let digit ch =
        match ch with
        | 'B' | 'F' | 'P' | 'V' -> '1'
        | 'C' | 'G' | 'J' | 'K' | 'Q' | 'S' | 'X' | 'Z' -> '2'
        | 'D' | 'T' -> '3'
        | 'L' -> '4'
        | 'M' | 'N' -> '5'
        | 'R' -> '6'
        | _ -> '0'

    // One step of the scan. State is (digit of the previous letter, digits so
    // far in reverse order).
    let step (previous, acc) ch =
        match ch with
        // H and W are transparent: they neither emit a digit nor separate
        // equal digits on either side of them (Ashcraft -> A261).
        | 'H' | 'W' -> (previous, acc)
        | _ ->
            let current = digit ch
            if current <> '0' && current <> previous then (current, current :: acc)
            // Repeats are skipped; uncoded letters reset the state to '0' so
            // the same digit may be emitted again after a vowel.
            else (current, acc)

    let letters =
        if System.String.IsNullOrEmpty x then []
        else
            x.ToUpperInvariant()
            |> Seq.filter (fun ch -> System.Char.IsLetter ch)
            |> List.ofSeq

    match letters with
    | [] -> "0000"
    | first :: rest ->
        // The first letter is kept verbatim (even H or W); its digit seeds the
        // scan so an immediately following letter of the same class is skipped.
        let _, reversed = List.fold step (digit first, []) rest
        let digits = reversed |> List.rev |> Seq.truncate 3 |> List.ofSeq
        let code = new System.String(Array.ofList (first :: digits))
        code.PadRight(4, '0')

// Print each sample name next to its Soundex code.
for name in [ "Robert"; "Rupert"; "Robbert"; "Rubin"
              "Beer"; "Bear"; "Bearer"
              "Smith"; "Smyth"
              "Ashcraft"; "Ashcroft"
              "Tymczak"; "Pfister" ] do
    printfn "%-8s = %s" name (americanSoundex name)

(*

Robert   = R163
Rupert   = R163
Robbert  = R163
Rubin    = R150
Beer     = B600
Bear     = B600
Bearer   = B660
Smith    = S530
Smyth    = S530
Ashcraft = A261
Ashcroft = A261
Tymczak  = T522
Pfister  = P236

*)

Go

package myPackageName

import (
	"bytes"
	"strings"
	"fmt"
)

// codeLen is the fixed length of a Soundex code: one letter plus three digits.
const codeLen = 4

// codes maps each lower-case ASCII letter to its Soundex digit. Letters that
// carry no code (vowels, h, w, y) map to the empty string.
var codes = map[string]string{
	"a": "",
	"b": "1",
	"c": "2",
	"d": "3",
	"e": "",
	"f": "1",
	"g": "2",
	"h": "",
	"i": "",
	"j": "2",
	"k": "2",
	"l": "4",
	"m": "5",
	"n": "5",
	"o": "",
	"p": "1",
	"q": "2",
	"r": "6",
	"s": "2",
	"t": "3",
	"u": "",
	"v": "1",
	"w": "",
	"x": "2",
	"y": "",
	"z": "2",
}

// Soundex returns the upper-cased, codeLen-character Soundex code of s.
//
// The input is processed byte by byte, so it is expected to be ASCII; bytes
// that are not in the codes table behave like vowels. An empty s yields an
// empty string.
func Soundex(s string) string {
	if s == "" {
		return ""
	}

	var encoded bytes.Buffer
	encoded.WriteByte(s[0])

	// Seed with the first letter's digit: it is not written, but it still
	// suppresses an immediately following letter of the same class
	// (Pfister -> P236).
	previous := codes[strings.ToLower(string(s[0]))]

	for i := 1; i < len(s) && encoded.Len() < codeLen; i++ {
		current := strings.ToLower(string(s[i]))

		// h and w are transparent: equal digits on either side of them count
		// as one (Ashcraft -> A261), so previous is left untouched.
		if current == "h" || current == "w" {
			continue
		}

		c := codes[current]
		if c != "" && c != previous {
			encoded.WriteString(c)
		}
		// Vowels reset previous to "", allowing the same digit again.
		previous = c
	}

	if encoded.Len() < codeLen {
		padding := strings.Repeat("0", codeLen-encoded.Len())
		encoded.WriteString(padding)
	}

	return strings.ToUpper(encoded.String())
}

Uso

// main prints the Soundex code of a sample word.
func main() {
	code := Soundex("Javascript") // J126
	fmt.Println(code)
}

Java

/**
 * Returns the Soundex digit of an upper-case letter as a one-character
 * string, or an empty string for any character that carries no code.
 */
private static String getCode(char c){
  if("BFPV".indexOf(c) >= 0){
    return "1";
  }
  if("CGJKQSXZ".indexOf(c) >= 0){
    return "2";
  }
  if("DT".indexOf(c) >= 0){
    return "3";
  }
  if(c == 'L'){
    return "4";
  }
  if("MN".indexOf(c) >= 0){
    return "5";
  }
  if(c == 'R'){
    return "6";
  }
  return "";
}
 
/**
 * Computes the American Soundex code (one letter plus three digits) of s.
 *
 * @param s the word to encode
 * @return the four-character code, or an empty string when s is null or empty
 */
public static String soundex(String s){
  if(s == null || s.isEmpty()){
    return "";
  }

  // Upper-case once, independently of the default locale.
  String upper = s.toUpperCase(java.util.Locale.ROOT);

  StringBuilder code = new StringBuilder(4);
  code.append(upper.charAt(0));

  // The first letter is kept verbatim, but its digit still suppresses an
  // immediately following letter of the same class (Pfister -> P236).
  String previous = getCode(upper.charAt(0));

  for(int i = 1; i < upper.length() && code.length() < 4; i++){
    char letter = upper.charAt(i);

    // H and W are transparent: equal digits on either side of them count
    // as one (Ashcraft -> A261), so previous is left untouched.
    if(letter == 'H' || letter == 'W'){
      continue;
    }

    String current = getCode(letter);
    if(current.length() > 0 && !current.equals(previous)){
      code.append(current);
    }
    // Vowels reset previous to "", allowing the same digit again.
    previous = current;
  }

  while(code.length() < 4){
    code.append('0');
  }
  return code.toString();
}

Uso

/** Prints the Soundex code of a few sample words, one per line. */
public static void main(String[] args){
    String[] words = {
        "Soundex",   // S532
        "Example",   // E251
        "Sownteks",  // S532
        "Ekzampul"   // E251
    };
    for(String word : words){
        System.out.println(soundex(word));
    }
}

Javascript

/**
 * Computes the American Soundex code (one letter plus three digits) of s.
 *
 * @param {string} s Word to encode.
 * @returns {string} Upper-cased four-character code, or '' for an empty input.
 */
var soundex = function(s) {
    // Digit for each coded consonant; vowels map to '' and every other
    // character (including y) is simply absent, which also counts as "no code".
    var codes = { a: '', e: '', i: '', o: '', u: '', b: 1, f: 1, p: 1, v: 1, c: 2, g: 2, j: 2, k: 2, q: 2, s: 2, x: 2, z: 2, d: 3, t: 3, l: 4, m: 5, n: 5, r: 6 };

    if (!s) {
        return '';
    }

    var letters = s.toLowerCase().split(''),
        first = letters.shift(),
        // The first letter is kept verbatim, but its digit still suppresses an
        // immediately following letter of the same class (Pfister -> P236).
        previous = codes[first],
        digits = '';

    letters.forEach(function(letter) {
        // h and w are transparent: equal digits on either side of them count
        // as one (Ashcraft -> A261), so `previous` is left untouched.
        if (letter === 'h' || letter === 'w') {
            return;
        }

        var code = codes[letter];
        if (code && code !== previous) {
            digits += code;
        }
        // Uncoded characters reset `previous`, allowing the same digit again.
        previous = code;
    });

    return (first + digits + '000').slice(0, 4).toUpperCase();
};

Uso

soundex("Javascript") == soundex("Jabascript"); // true: both words encode to J126

Objective-C

You can find an Objective-C implementation of the Soundex algorithm in this GitHub gist, written by Darkseed.

PHP

PHP has already soundex as a built-in function that calculates the soundex key of a string.

Uso

soundex("PHP Server Language") == soundex("PHP Serber language"); // true: both strings encode to P261

Python

Function

def get_soundex(name):
	"""Return the American Soundex code (one letter plus three digits) of name.

	The first character is kept verbatim (upper-cased). Every other character
	is mapped to its Soundex digit; adjacent letters with the same digit are
	coded once, vowels separate repeats, and H/W are ignored entirely.
	Returns an empty string when name is empty.
	"""
	groups = {"BFPV": "1", "CGJKQSXZ": "2", "DT": "3", "L": "4", "MN": "5", "R": "6"}
	# Flatten the groups into a per-letter lookup table.
	codes = {}
	for letters, digit in groups.items():
		for letter in letters:
			codes[letter] = digit

	name = name.upper()
	if not name:
		return ""

	soundex = name[0]
	# The first letter is not coded, but its digit still suppresses an
	# immediately following letter of the same class (Pfister -> P236).
	previous = codes.get(name[0], "")

	for char in name[1:]:
		# H and W are transparent: equal digits on either side of them count
		# as one (Ashcraft -> A261), so `previous` is left untouched.
		if char in ("H", "W"):
			continue
		code = codes.get(char, "")
		if code and code != previous:
			soundex += code
		# Vowels and other uncoded characters reset `previous`.
		previous = code

	return soundex[:4].ljust(4, "0")

Uso

    # Sample names; consecutive pairs are expected to share a code.
    names = ["Smith", "Smythe", "Robert", "Rupert", "Schultz", "Shultz"]

    print("NAME\t\tSOUNDEX")
    for name in names:
        print("%s\t\t%s" % (name, get_soundex(name)))

Library

If you prefer to use a library, you can use the fuzzy package (which uses C Extensions (via Pyrex) for speed).

Ruby

class String

  # Coded consonants and, position by position, their Soundex digits.
  SoundexChars = 'BFPVCGJKQSXZDTLMNR'
  SoundexNums  = '111122222222334556'
  # Kept for callers that reference it: negated set of the coded consonants.
  SoundexCharsEx = '^' + SoundexChars
  # Negated set used to strip everything that is not an upper-case letter.
  SoundexCharsDel = '^A-Z'

  # desc: http://en.wikipedia.org/wiki/Soundex
  #
  # Returns the American Soundex code of the string: its first letter
  # followed by digits, padded with '0' to at least three digits.
  # With census = true (default) the digits are truncated to three;
  # with census = false every digit is kept.
  # Returns '' when the string contains no letters.
  def soundex(census = true)
    str = self.upcase.delete(SoundexCharsDel)
    return '' if str.empty?

    digits = ''
    # The first letter is not coded, but its digit still suppresses an
    # immediately following letter of the same class (Lloyd -> L300).
    previous = str[0, 1].tr(SoundexChars, SoundexNums)

    str[1..-1].each_char do |ch|
      # H and W are transparent: equal digits on either side of them count
      # as one (Ashcraft -> A261), so previous is left untouched.
      next if ch == 'H' || ch == 'W'

      # Uncoded letters (vowels, Y) come back from tr unchanged.
      code = ch.tr(SoundexChars, SoundexNums)
      digits += code if code != ch && code != previous
      # A vowel replaces previous, so the same digit may follow it again
      # (Lissajous -> L222).
      previous = code
    end

    digits = digits[0, 3] if census
    str[0, 1] + digits.ljust(3, '0')
  end

  # True when both strings have the same Soundex code.
  def sounds_like(other)
    self.soundex == other.soundex
  end
end

Uso

# Show the code of each word, then whether each pair sounds alike.
%w(Soundex Sownteks Example Ekzampul foo bar).each_slice(2) do |word1, word2|
  [word1, word2].each do |word|
    puts format('%-8s -> %s', word, word.soundex)
  end

  verdict = word1.sounds_like(word2) ? "sounds" : "does not sound"
  print "'#{word1}' #{verdict} like '#{word2}'\n"
end

#Soundex  -> S532
#Sownteks -> S532
#'Soundex' sounds like 'Sownteks'
#Example  -> E251
#Ekzampul -> E251
#'Example' sounds like 'Ekzampul'
#foo      -> F000
#bar      -> B600
#'foo' does not sound like 'bar'

Scala

/** Returns the four-character Soundex code of `s`: its upper-cased first
  * character followed by the digits of the remaining letters, where a digit
  * equal to the one of the directly preceding character is skipped and the
  * result is padded with zeros.
  */
def soundex(s:String)={
   val first = s.head.toUpper
   // Accumulator: (digits collected so far, code of the previous character).
   val (digits, _) = s.drop(1).foldLeft(("", getCode(first))) { case ((acc, previous), ch) =>
      val current = getCode(ch.toUpper)
      if (current.nonEmpty && current != previous) (acc + current, current)
      else (acc, current)
   }
   (first.toString + digits + "0000").slice(0, 4)
}
 
/** Returns the Soundex digit of an upper-case letter as a string, or an
  * empty string for any character that carries no code.
  */
def getCode(c:Char)={
   c match {
      case 'B' | 'F' | 'P' | 'V' => "1"
      case 'C' | 'G' | 'J' | 'K' | 'Q' | 'S' | 'X' | 'Z' => "2"
      case 'D' | 'T' => "3"
      case 'L' => "4"
      case 'M' | 'N' => "5"
      case 'R' => "6"
      case _ => ""
   }
}

Uso

/** Runs soundex over a table of names and prints, for each one, the expected
  * code, the computed code and whether they match.
  */
def main(args: Array[String]): Unit = {
   val tests=Map(
      "Soundex"     -> "S532",
      "Euler"       -> "E460",
      "Gauss"       -> "G200",
      "Hilbert"     -> "H416",
      "Knuth"       -> "K530",
      "Lloyd"       -> "L300",
      "Lukasiewicz" -> "L222",
      "Ellery"      -> "E460",
      "Ghosh"       -> "G200",
      "Heilbronn"   -> "H416",
      "Kant"        -> "K530",
      "Ladd"        -> "L300",
      "Lissajous"   -> "L222",
      "Wheaton"     -> "W350",
      "Ashcraft"    -> "A226",
      "Burroughs"   -> "B622",
      "Burrows"     -> "B620",
      "O'Hara"      -> "O600")

   for ((name, expected) <- tests) {
      val found = soundex(name)
      val status = if (found == expected) "OK" else "ERROR"
      printf("Name: %-20s  Code: %s   Found: %s  - %s\n", name, expected, found, status)
   }
}

Swift

The class written by Clifford Helsel in this GitHub repository is an implementation of the original Soundex algorithm in the Swift language.

//
//  Soundex.swift
//  speller
//
//  Created by Clifford Helsel on 4/28/16.
//
//  Based on standard Soundex algorithm and loosely ported from Apache Commons
//  https://commons.apache.org/proper/commons-codec/apidocs/src-html/org/apache/commons/codec/language/Soundex.html


/// American Soundex encoder: maps a word to its first letter followed by
/// three digits, so that similar-sounding English words share a code.
public class Soundex {

    // Digit for each letter of `en_alphabet`, position by position;
    // "0" marks letters that carry no code (vowels, H, W, Y).
    private static let en_mapping_string = Array("01230120022455012623010202")
    private static let en_alphabet = Array("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
    private let mapping: [Character:Character] = Soundex.buildMapping(letters: Soundex.en_alphabet, digits: Soundex.en_mapping_string)

    /// Builds the letter -> digit lookup table from two parallel arrays.
    private static func buildMapping(letters: [Character], digits: [Character]) -> [Character:Character] {
        var retval: [Character:Character] = [:]
        for (index, letter) in letters.enumerated() {
            retval[letter] = digits[index]
        }
        return retval
    }

    /// Returns the digit of an upper-case letter. Characters outside A-Z are
    /// not covered by the original Soundex specification; they get "0" and
    /// therefore behave like vowels.
    private func mapChar(c: Character) -> Character {
        return mapping[c] ?? "0"
    }

    /// Computes the four-character Soundex code of `of`.
    ///
    /// - Parameter of: the word to encode.
    /// - Returns: the code (zero-padded), or an empty string for empty input.
    public func soundex(of: String) -> String {
        // Materialise the characters once so the scan is a single pass.
        let chars = Array(of.uppercased())

        guard let first = chars.first else {
            return ""
        }

        // Pre-filled with "0" so short codes come out zero-padded.
        var out: [Character] = Array(repeating: "0", count: 4)
        out[0] = first
        var count = 1

        // The first letter is kept verbatim, but its digit still suppresses
        // an immediately following letter of the same class.
        var last = mapChar(c: first)

        for ch in chars.dropFirst() {
            if count == out.count {
                break
            }

            // H and W are transparent: equal digits on either side of them
            // count as one (Ashcraft -> A261), so `last` is left untouched.
            if ch == "H" || ch == "W" {
                continue
            }

            let mapped = mapChar(c: ch)
            if mapped != "0" && mapped != last {
                out[count] = mapped
                count += 1
            }
            // Track every letter, vowels included, so that a vowel separates
            // repeats while adjacent equal digits collapse.
            last = mapped
        }

        return String(out)
    }
}

Uso

let c = Soundex()

c.soundex(of:"Christopher") // standard Soundex code for this name: C623

VBScript

' Returns the Soundex digit of an upper-case letter as a string.
' The return value is left unassigned (Empty) for any other input.
Function getCode(c)
    If c = "B" Or c = "F" Or c = "P" Or c = "V" Then
        getCode = "1"
    ElseIf c = "C" Or c = "G" Or c = "J" Or c = "K" Or c = "Q" Or c = "S" Or c = "X" Or c = "Z" Then
        getCode = "2"
    ElseIf c = "D" Or c = "T" Then
        getCode = "3"
    ElseIf c = "L" Then
        getCode = "4"
    ElseIf c = "M" Or c = "N" Then
        getCode = "5"
    ElseIf c = "R" Then
        getCode = "6"
    End If
End Function
 
' Computes the American Soundex code (one letter plus three digits) of s.
' The first character is kept (upper-cased); the result is always padded
' with "0" to four characters, so an empty s yields "0000".
Function soundex(s)
    Dim code, previous, current, letter, i
    code = UCase(Mid(s, 1, 1))
    ' The first letter is not coded, but its digit still suppresses an
    ' immediately following letter of the same class (Pfister -> P236).
    previous = getCode(code)
    For i = 2 To Len(s)
        letter = UCase(Mid(s, i, 1))
        ' H and W are transparent: equal digits on either side of them count
        ' as one (Ashcraft -> A261), so previous is left untouched.
        If letter <> "H" And letter <> "W" Then
            current = getCode(letter)
            If Len(current) > 0 And current <> previous Then
                code = code & current
            End If
            ' Vowels leave current empty, which lets the same digit follow.
            previous = current
        End If
    Next
    soundex = Mid(code, 1, 4)
    If Len(code) < 4 Then
        soundex = soundex & String(4 - Len(code), "0")
    End If
End Function

En caso de que conozcas una implementación del algoritmo Soundex en otro lenguaje de programación que no esté en esta lista (o que esté mejor implementada que las de aquí), por favor compártela con la comunidad en la caja de comentarios. ¡Que te diviertas!

Esto podría ser de tu interes

Conviertete en un programador más sociable