Paste: jarowinklerdistance

Author: mrjbq7
Mode: text
Date: Sat, 21 Jun 2025 16:15:21
Plain Text |
"""
    jarowinklerdistance(s1, s2)

Return the Jaro-Winkler distance between the sequences `s1` and `s2` as a
`Float64` in `[0, 1]`: `0.0` for identical inputs, `1.0` when nothing matches.

Both arguments are materialised with `collect`, so strings are compared
character by character (not byte by byte) and any iterable with a known length
is accepted. The result does not depend on the order of the arguments.

The Winkler bonus uses a scaling factor of `0.1` over a common prefix of at
most four elements.
"""
function jarowinklerdistance(s1, s2)
    # Work on element vectors: `String` indexing is by byte offset, which
    # disagrees with `length` as soon as a multi-byte character appears.
    longer, shorter = collect(s1), collect(s2)
    if length(longer) < length(shorter)
        longer, shorter = shorter, longer
    end
    len1, len2 = length(longer), length(shorter)
    # Two empty inputs are identical; an empty input against a non-empty one
    # shares nothing with it.
    if len2 == 0
        return len1 == 0 ? 0.0 : 1.0
    end

    # Elements match only when their positions differ by at most `window`.
    window = max(0, len1 ÷ 2 - 1)
    used = zeros(Bool, len2)          # positions of `shorter` already matched
    matched = eltype(longer)[]        # matched elements of `longer`, in order
    for (i, ch1) in enumerate(longer)
        # The range is inclusive on both ends, so the window is symmetric.
        for j in max(i - window, 1):min(i + window, len2)
            if !used[j] && ch1 == shorter[j]
                used[j] = true
                push!(matched, ch1)
                break
            end
        end
    end
    nmatches = length(matched)
    nmatches == 0 && return 1.0

    # Walk the matched elements of `shorter` in order and count the positions
    # where they disagree with the matched elements of `longer`.
    transpositions, k = 0, 0
    for (j, ch2) in enumerate(shorter)
        if used[j]
            k += 1
            transpositions += (ch2 != matched[k])
        end
    end
    jaro = (
        nmatches / len1 + nmatches / len2 +
        (nmatches - transpositions / 2) / nmatches
    ) / 3.0

    # Length of the common prefix (capped at 4): stop at the first mismatch.
    prefixlen = 0
    for i in 1:min(len2, 4)
        longer[i] == shorter[i] || break
        prefixlen += 1
    end
    return 1 - (jaro + prefixlen * 0.1 * (1 - jaro))
end

Annotation: faster

Author: mrjbq7
Mode: text
Date: Sat, 21 Jun 2025 19:04:00
Plain Text |
"""
    jarowinklerdistance(t1::AbstractString, t2::AbstractString)

Return the Jaro-Winkler distance between the strings `t1` and `t2` as a
`Float64` in `[0, 1]`: `0.0` for identical strings, `1.0` when no characters
match.

Both strings are collected into character vectors first, so positions are
character positions rather than byte offsets. The result does not depend on
the order of the arguments.

The Winkler bonus uses a scaling factor of `0.1` over a common prefix of at
most four characters.
"""
function jarowinklerdistance(t1::AbstractString, t2::AbstractString)
    chars1, chars2 = collect(t1), collect(t2)
    # Bind the longer/shorter vectors once instead of swapping in place, so
    # neither local is ever reassigned.
    longer, shorter = if length(chars1) < length(chars2)
        (chars2, chars1)
    else
        (chars1, chars2)
    end
    len1, len2 = length(longer), length(shorter)
    # Two empty strings are identical; an empty string against a non-empty
    # one shares nothing with it.
    if len2 == 0
        return len1 == 0 ? 0.0 : 1.0
    end

    # Characters match only when their positions differ by at most `window`.
    window = max(0, len1 ÷ 2 - 1)
    used = zeros(Bool, len2)          # positions of `shorter` already matched
    matched = eltype(longer)[]        # matched characters of `longer`, in order
    sizehint!(matched, len2)          # there can be at most `len2` matches
    for (i, ch1) in enumerate(longer)
        lo = max(i - window, 1)
        hi = min(i + window, len2)    # inclusive bound: symmetric window
        for j in lo:hi
            if !used[j] && ch1 == shorter[j]
                used[j] = true
                push!(matched, ch1)
                break
            end
        end
    end
    nmatches = length(matched)
    nmatches == 0 && return 1.0

    # Walk the matched characters of `shorter` in order and count the positions
    # where they disagree with the matched characters of `longer`.
    transpositions, k = 0, 0
    for (j, ch2) in enumerate(shorter)
        if used[j]
            k += 1
            transpositions += (ch2 != matched[k])
        end
    end
    jaro = (
        nmatches / len1 + nmatches / len2 +
        (nmatches - transpositions / 2) / nmatches
    ) / 3.0

    # Length of the common prefix (capped at 4): stop at the first mismatch.
    prefixlen = 0
    for i in 1:min(len2, 4)
        longer[i] == shorter[i] || break
        prefixlen += 1
    end
    return 1 - (jaro + prefixlen * 0.1 * (1 - jaro))
end

New Annotation

Summary:
Author:
Mode:
Body: