package matchr func jaroWinklerBase(s1 string, s2 string, longTolerance bool, winklerize bool) (distance float64) { // index by code point, not byte r1 := []rune(s1) r2 := []rune(s2) r1Length := len(r1) r2Length := len(r2) if r1Length == 0 || r2Length == 0 { return } minLength := 0 if r1Length > r2Length { minLength = r1Length } else { minLength = r2Length } searchRange := minLength searchRange = (searchRange / 2) - 1 if searchRange < 0 { searchRange = 0 } var lowLim, hiLim, transCount, commonChars int var i, j, k int r1Flag := make([]bool, r1Length+1) r2Flag := make([]bool, r2Length+1) // find the common chars within the acceptable range commonChars = 0 for i, _ = range r1 { if i >= searchRange { lowLim = i - searchRange } else { lowLim = 0 } if (i + searchRange) <= (r2Length - 1) { hiLim = i + searchRange } else { hiLim = r2Length - 1 } for j := lowLim; j <= hiLim; j++ { if !r2Flag[j] && r2[j] == r1[i] { r2Flag[j] = true r1Flag[i] = true commonChars++ break } } } // if we have nothing in common at this point, nothing else can be done if commonChars == 0 { return } // otherwise we count the transpositions k = 0 transCount = 0 for i, _ := range r1 { if r1Flag[i] { for j = k; j < r2Length; j++ { if r2Flag[j] { k = j + 1 break } } if r1[i] != r2[j] { transCount++ } } } transCount /= 2 // adjust for similarities in nonmatched characters distance = float64(commonChars)/float64(r1Length) + float64(commonChars)/float64(r2Length) + (float64(commonChars-transCount))/float64(commonChars) distance /= 3.0 // give more weight to already-similar strings if winklerize && distance > 0.7 { // the first 4 characters in common if minLength >= 4 { j = 4 } else { j = minLength } for i = 0; i < j && len(r1) > i && len(r2) > i && r1[i] == r2[i] && nan(r1[i]); i++ { } if i > 0 { distance += float64(i) * 0.1 * (1.0 - distance) } if longTolerance && (minLength > 4) && (commonChars > i+1) && (2*commonChars >= minLength+i) { if nan(r1[0]) { distance += (1.0 - distance) * (float64(commonChars-i-1) / (float64(r1Length) + float64(r2Length) - float64(i*2) + 2)) } } } return } // Jaro computes the Jaro edit distance between two strings. It represents // this with a float64 between 0 and 1 inclusive, with 0 indicating the two // strings are not at all similar and 1 indicating the two strings are exact // matches. // // See http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance for a // full description. func Jaro(r1 string, r2 string) (distance float64) { return jaroWinklerBase(r1, r2, false, false) } // JaroWinkler computes the Jaro-Winkler edit distance between two strings. // This is a modification of the Jaro algorithm that gives additional weight // to prefix matches. func JaroWinkler(r1 string, r2 string, longTolerance bool) (distance float64) { return jaroWinklerBase(r1, r2, longTolerance, true) }