2016-05-16

Jaro-Winkler in PL/pgSQL

We needed to do a fuzzy text match in PostgreSQL, but the fuzzy search algorithms that are available in the extensions that come in the Debian Postgres packages are pretty crude. Jaro-Winkler distance is available in the extension pg_similarity, but that requires doing make; make install, which makes installation cumbersome our development and production environments. An alternative is installing postgresql-plpython and using the Jaro-Winkler distance from the jellyfish library, but that reqires installing the package and enabling the pg_plpythonu extension on all databases. So I took some time and ported the algorithm to PL/pgSQL, which does not require any extensions:
-- Ported from jellyfish/_jellyfish.py

CREATE OR REPLACE FUNCTION jaro_winkler(ying TEXT, yang TEXT)
    RETURNS float8 AS $$
DECLARE
    ying_len integer := LENGTH(ying);
    yang_len integer := LENGTH(yang);
    min_len integer := GREATEST(ying_len, yang_len);
    search_range integer;
    ying_flags bool[];
    yang_flags bool[];
    common_chars float8 := 0;
    ying_ch TEXT;
    hi integer;
    low integer;
    trans_count integer := 0;
    weight float8;
    i integer;
    j integer;
    jj integer;
    k integer;
BEGIN

    IF ying_len = 0 OR yang_len = 0 THEN
        RETURN 0;
    END IF;

    search_range := (GREATEST(ying_len, yang_len) / 2) - 1;
    IF search_range < 0 THEN
        search_range := 0;
    END IF;
    FOR i IN 1 .. ying_len LOOP
        ying_flags[i] := false;
    END LOOP;
    FOR i IN 1 .. yang_len LOOP
        yang_flags[i] := false;
    END LOOP;

    -- looking only within search range, count & flag matched pairs
    FOR i in 1 .. ying_len LOOP
        ying_ch := SUBSTRING(ying FROM i for 1);
        IF i > search_range THEN
            low := i - search_range;
        ELSE
            low := 1;
        END IF;
        IF i + search_range <= yang_len THEN
            hi := i + search_range;
        ELSE
            hi := yang_len;
        END IF;
        <<inner>>
        FOR j IN low .. hi LOOP
            IF NOT yang_flags[j] AND
                 SUBSTRING(yang FROM j FOR 1) = ying_ch THEN
               ying_flags[i] := true;
               yang_flags[j] := true;
               common_chars := common_chars + 1;
               EXIT inner;
            END IF;
        END LOOP inner;
    END LOOP;
    -- short circuit if no characters match
    IF common_chars = 0 THEN
        RETURN 0;
    END IF;

    -- count transpositions
    k := 1;
    FOR i IN 1 .. ying_len LOOP
        IF ying_flags[i] THEN
            <<inner2>>
            FOR j IN k .. yang_len LOOP
                jj := j;
                IF yang_flags[j] THEN
                    k := j + 1;
                    EXIT inner2;
                END IF;
            END LOOP;
            IF SUBSTRING(ying FROM i FOR 1) <>
                    SUBSTRING(yang FROM jj FOR 1) THEN
                trans_count := trans_count + 1;
            END IF;
        END IF;
    END LOOP;
    trans_count := trans_count / 2;

    -- adjust for similarities in nonmatched characters
    weight := ((common_chars/ying_len + common_chars/yang_len +
               (common_chars-trans_count) / common_chars)) / 3;

    -- winkler modification: continue to boost if strings are similar
    IF weight > 0.7 AND ying_len > 3 AND yang_len > 3 THEN
       -- adjust for up to first 4 chars in common
       j := LEAST(min_len, 4);
       i := 1;
       WHILE i - 1 < j AND
             SUBSTRING(ying FROM i FOR 1) = SUBSTRING(yang FROM i FOR 1) LOOP
           i := i + 1;
       END LOOP;
       weight := weight + (i - 1) * 0.1 * (1.0 - weight);
    END IF;

    RETURN weight;

END;
$$
LANGUAGE plpgsql;