We needed to do a fuzzy text match in PostgreSQL, but the fuzzy search algorithms that are available in the extensions that come in the Debian Postgres packages are pretty crude. Jaro-Winkler distance is available in the extension
pg_similarity, but that requires doing
make; make install
, which makes installation cumbersome our development and production environments.
An alternative is installing
postgresql-plpython
and using the Jaro-Winkler distance from the
jellyfish
library, but that reqires installing the package and enabling the
pg_plpythonu
extension on all databases. So I took some time and ported the algorithm to PL/pgSQL, which does not require any extensions:
-- Ported from jellyfish/_jellyfish.py
CREATE OR REPLACE FUNCTION jaro_winkler(ying TEXT, yang TEXT)
RETURNS float8 AS $$
DECLARE
ying_len integer := LENGTH(ying);
yang_len integer := LENGTH(yang);
min_len integer := GREATEST(ying_len, yang_len);
search_range integer;
ying_flags bool[];
yang_flags bool[];
common_chars float8 := 0;
ying_ch TEXT;
hi integer;
low integer;
trans_count integer := 0;
weight float8;
i integer;
j integer;
jj integer;
k integer;
BEGIN
IF ying_len = 0 OR yang_len = 0 THEN
RETURN 0;
END IF;
search_range := (GREATEST(ying_len, yang_len) / 2) - 1;
IF search_range < 0 THEN
search_range := 0;
END IF;
FOR i IN 1 .. ying_len LOOP
ying_flags[i] := false;
END LOOP;
FOR i IN 1 .. yang_len LOOP
yang_flags[i] := false;
END LOOP;
-- looking only within search range, count & flag matched pairs
FOR i in 1 .. ying_len LOOP
ying_ch := SUBSTRING(ying FROM i for 1);
IF i > search_range THEN
low := i - search_range;
ELSE
low := 1;
END IF;
IF i + search_range <= yang_len THEN
hi := i + search_range;
ELSE
hi := yang_len;
END IF;
<<inner>>
FOR j IN low .. hi LOOP
IF NOT yang_flags[j] AND
SUBSTRING(yang FROM j FOR 1) = ying_ch THEN
ying_flags[i] := true;
yang_flags[j] := true;
common_chars := common_chars + 1;
EXIT inner;
END IF;
END LOOP inner;
END LOOP;
-- short circuit if no characters match
IF common_chars = 0 THEN
RETURN 0;
END IF;
-- count transpositions
k := 1;
FOR i IN 1 .. ying_len LOOP
IF ying_flags[i] THEN
<<inner2>>
FOR j IN k .. yang_len LOOP
jj := j;
IF yang_flags[j] THEN
k := j + 1;
EXIT inner2;
END IF;
END LOOP;
IF SUBSTRING(ying FROM i FOR 1) <>
SUBSTRING(yang FROM jj FOR 1) THEN
trans_count := trans_count + 1;
END IF;
END IF;
END LOOP;
trans_count := trans_count / 2;
-- adjust for similarities in nonmatched characters
weight := ((common_chars/ying_len + common_chars/yang_len +
(common_chars-trans_count) / common_chars)) / 3;
-- winkler modification: continue to boost if strings are similar
IF weight > 0.7 AND ying_len > 3 AND yang_len > 3 THEN
-- adjust for up to first 4 chars in common
j := LEAST(min_len, 4);
i := 1;
WHILE i - 1 < j AND
SUBSTRING(ying FROM i FOR 1) = SUBSTRING(yang FROM i FOR 1) LOOP
i := i + 1;
END LOOP;
weight := weight + (i - 1) * 0.1 * (1.0 - weight);
END IF;
RETURN weight;
END;
$$
LANGUAGE plpgsql;
3 comments:
If you try with the strings 'Amazon' and 'amaozn', it gives 0.755555555555556, while it should return 0.961111111111111.
I checked this against the default select UTL_MATCH.jaro_winkler('amaozn','Amazon') from dual; that Oracle 12C comes with and it returns 0.75555555555555
Late to the party (as usual) but both are correct.
Comparing 'amaozn' and 'amazon' returns 0.961111111111111.
Comparing 'amaozn' and 'Amazon' returns 0.755555555555556.
Post a Comment