-- |
-- Module      :  Data.Text.Metrics
-- Copyright   :  © 2016–2017 Mark Karpov
-- License     :  BSD 3 clause
--
-- Maintainer  :  Mark Karpov <markkarpov92@gmail.com>
-- Stability   :  experimental
-- Portability :  portable
--
-- The module provides efficient implementations of various strings metric
-- algorithms. It works with strict 'Text' values.
--
-- __Note__: before version /0.3.0/ the package used C implementations of
-- the algorithms under the hood. Beginning from version /0.3.0/, the
-- implementations are written in Haskell while staying almost as fast, see:
--
-- <https://markkarpov.com/post/migrating-text-metrics.html>

{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE CPP          #-}
{-# LANGUAGE MultiWayIf   #-}

module Data.Text.Metrics
  ( -- * Levenshtein variants
    levenshtein
  , levenshteinNorm
  , damerauLevenshtein
  , damerauLevenshteinNorm
    -- * Treating inputs like sets
  , overlap
  , jaccard
    -- * Other
  , hamming
  , jaro
  , jaroWinkler )
where

import Control.Monad
import Control.Monad.ST
import Data.Map.Strict (Map)
import Data.Ratio
import Data.Text
import GHC.Exts (inline)
import qualified Data.Map.Strict as M
import qualified Data.Text                   as T
import qualified Data.Text.Unsafe            as TU
import qualified Data.Vector.Unboxed.Mutable as VUM

#if !MIN_VERSION_base(4,8,0)
import Control.Applicative
#endif

----------------------------------------------------------------------------
-- Levenshtein variants

-- | Return Levenshtein distance between two 'Text' values. Classic
-- Levenshtein distance between two strings is the minimal number of
-- operations necessary to transform one string into another. For
-- Levenshtein distance allowed operations are: deletion, insertion, and
-- substitution.
--
-- See also: <https://en.wikipedia.org/wiki/Levenshtein_distance>.
--
-- __Heads up__, before version /0.3.0/ this function returned
-- 'Data.Numeric.Natural'.

levenshtein :: Text -> Text -> Int
levenshtein :: Text -> Text -> Int
levenshtein a :: Text
a b :: Text
b = (Int, Int) -> Int
forall a b. (a, b) -> a
fst (Text -> Text -> (Int, Int)
levenshtein_ Text
a Text
b)

-- | Return normalized Levenshtein distance between two 'Text' values.
-- Result is a non-negative rational number (represented as @'Ratio'
-- 'Data.Numeric.Natural'@), where 0 signifies no similarity between the
-- strings, while 1 means exact match.
--
-- See also: <https://en.wikipedia.org/wiki/Levenshtein_distance>.
--
-- __Heads up__, before version /0.3.0/ this function returned @'Ratio'
-- 'Data.Numeric.Natural'@.

levenshteinNorm :: Text -> Text -> Ratio Int
levenshteinNorm :: Text -> Text -> Ratio Int
levenshteinNorm = (Text -> Text -> (Int, Int)) -> Text -> Text -> Ratio Int
norm Text -> Text -> (Int, Int)
levenshtein_

-- | An internal helper, returns Levenshtein distance as the first element
-- of the tuple and max length of the two inputs as the second element of
-- the tuple.

levenshtein_ :: Text -> Text -> (Int, Int)
levenshtein_ :: Text -> Text -> (Int, Int)
levenshtein_ a :: Text
a b :: Text
b
  | Text -> Bool
T.null Text
a = (Int
lenb, Int
lenm)
  | Text -> Bool
T.null Text
b = (Int
lena, Int
lenm)
  | Bool
otherwise = (forall s. ST s (Int, Int)) -> (Int, Int)
forall a. (forall s. ST s a) -> a
runST ((forall s. ST s (Int, Int)) -> (Int, Int))
-> (forall s. ST s (Int, Int)) -> (Int, Int)
forall a b. (a -> b) -> a -> b
$ do
      let v_len :: Int
v_len = Int
lenb Int -> Int -> Int
forall a. Num a => a -> a -> a
+ 1
      MVector s Int
v <- Int -> ST s (MVector (PrimState (ST s)) Int)
forall (m :: * -> *) a.
(PrimMonad m, Unbox a) =>
Int -> m (MVector (PrimState m) a)
VUM.unsafeNew (Int
v_len Int -> Int -> Int
forall a. Num a => a -> a -> a
* 2)
      let gov :: Int -> ST s ()
gov !Int
i =
            Bool -> ST s () -> ST s ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when (Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
v_len) (ST s () -> ST s ()) -> ST s () -> ST s ()
forall a b. (a -> b) -> a -> b
$ do
              MVector (PrimState (ST s)) Int -> Int -> Int -> ST s ()
forall (m :: * -> *) a.
(PrimMonad m, Unbox a) =>
MVector (PrimState m) a -> Int -> a -> m ()
VUM.unsafeWrite MVector s Int
MVector (PrimState (ST s)) Int
v Int
i Int
i
              Int -> ST s ()
gov (Int
i Int -> Int -> Int
forall a. Num a => a -> a -> a
+ 1)
          goi :: Int -> Int -> Int -> Int -> ST s ()
goi !Int
i !Int
na !Int
v0 !Int
v1 = do
            let !(TU.Iter ai :: Char
ai da :: Int
da) = Text -> Int -> Iter
TU.iter Text
a Int
na
                goj :: Int -> Int -> ST s ()
goj !Int
j !Int
nb =
                  Bool -> ST s () -> ST s ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when (Int
j Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
lenb) (ST s () -> ST s ()) -> ST s () -> ST s ()
forall a b. (a -> b) -> a -> b
$ do
                    let !(TU.Iter bj :: Char
bj db :: Int
db) = Text -> Int -> Iter
TU.iter Text
b Int
nb
                        cost :: Int
cost = if Char
ai Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
bj then 0 else 1
                    Int
x <- (Int -> Int -> Int
forall a. Num a => a -> a -> a
+ 1) (Int -> Int) -> ST s Int -> ST s Int
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> MVector (PrimState (ST s)) Int -> Int -> ST s Int
forall (m :: * -> *) a.
(PrimMonad m, Unbox a) =>
MVector (PrimState m) a -> Int -> m a
VUM.unsafeRead MVector s Int
MVector (PrimState (ST s)) Int
v (Int
v1 Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
j)
                    Int
y <- (Int -> Int -> Int
forall a. Num a => a -> a -> a
+ 1) (Int -> Int) -> ST s Int -> ST s Int
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> MVector (PrimState (ST s)) Int -> Int -> ST s Int
forall (m :: * -> *) a.
(PrimMonad m, Unbox a) =>
MVector (PrimState m) a -> Int -> m a
VUM.unsafeRead MVector s Int
MVector (PrimState (ST s)) Int
v (Int
v0 Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
j Int -> Int -> Int
forall a. Num a => a -> a -> a
+ 1)
                    Int
z <- (Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
cost) (Int -> Int) -> ST s Int -> ST s Int
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> MVector (PrimState (ST s)) Int -> Int -> ST s Int
forall (m :: * -> *) a.
(PrimMonad m, Unbox a) =>
MVector (PrimState m) a -> Int -> m a
VUM.unsafeRead MVector s Int
MVector (PrimState (ST s)) Int
v (Int
v0 Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
j)
                    MVector (PrimState (ST s)) Int -> Int -> Int -> ST s ()
forall (m :: * -> *) a.
(PrimMonad m, Unbox a) =>
MVector (PrimState m) a -> Int -> a -> m ()
VUM.unsafeWrite MVector s Int
MVector (PrimState (ST s)) Int
v (Int
v1 Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
j Int -> Int -> Int
forall a. Num a => a -> a -> a
+ 1) (Int -> Int -> Int
forall a. Ord a => a -> a -> a
min Int
x (Int -> Int -> Int
forall a. Ord a => a -> a -> a
min Int
y Int
z))
                    Int -> Int -> ST s ()
goj (Int
j Int -> Int -> Int
forall a. Num a => a -> a -> a
+ 1) (Int
nb Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
db)
            Bool -> ST s () -> ST s ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when (Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
lena) (ST s () -> ST s ()) -> ST s () -> ST s ()
forall a b. (a -> b) -> a -> b
$ do
              MVector (PrimState (ST s)) Int -> Int -> Int -> ST s ()
forall (m :: * -> *) a.
(PrimMonad m, Unbox a) =>
MVector (PrimState m) a -> Int -> a -> m ()
VUM.unsafeWrite MVector s Int
MVector (PrimState (ST s)) Int
v Int
v1 (Int
i Int -> Int -> Int
forall a. Num a => a -> a -> a
+ 1)
              Int -> Int -> ST s ()
goj 0 0
              Int -> Int -> Int -> Int -> ST s ()
goi (Int
i Int -> Int -> Int
forall a. Num a => a -> a -> a
+ 1) (Int
na Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
da) Int
v1 Int
v0
      Int -> ST s ()
gov 0
      Int -> Int -> Int -> Int -> ST s ()
goi 0 0 0 Int
v_len
      Int
ld <- MVector (PrimState (ST s)) Int -> Int -> ST s Int
forall (m :: * -> *) a.
(PrimMonad m, Unbox a) =>
MVector (PrimState m) a -> Int -> m a
VUM.unsafeRead MVector s Int
MVector (PrimState (ST s)) Int
v (Int
lenb Int -> Int -> Int
forall a. Num a => a -> a -> a
+ if Int -> Bool
forall a. Integral a => a -> Bool
even Int
lena then 0 else Int
v_len)
      (Int, Int) -> ST s (Int, Int)
forall (m :: * -> *) a. Monad m => a -> m a
return (Int
ld, Int
lenm)
  where
    lena :: Int
lena = Text -> Int
T.length Text
a
    lenb :: Int
lenb = Text -> Int
T.length Text
b
    lenm :: Int
lenm = Int -> Int -> Int
forall a. Ord a => a -> a -> a
max Int
lena Int
lenb
{-# INLINE levenshtein_ #-}

-- | Return Damerau-Levenshtein distance between two 'Text' values. The
-- function works like 'levenshtein', but the collection of allowed
-- operations also includes transposition of two /adjacent/ characters.
--
-- See also: <https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance>.
--
-- __Heads up__, before version /0.3.0/ this function returned
-- 'Data.Numeric.Natural'.

damerauLevenshtein :: Text -> Text -> Int
damerauLevenshtein :: Text -> Text -> Int
damerauLevenshtein a :: Text
a b :: Text
b = (Int, Int) -> Int
forall a b. (a, b) -> a
fst (Text -> Text -> (Int, Int)
damerauLevenshtein_ Text
a Text
b)

-- | Return normalized Damerau-Levenshtein distance between two 'Text'
-- values. 0 signifies no similarity between the strings, while 1 means
-- exact match.
--
-- See also: <https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance>.
--
-- __Heads up__, before version /0.3.0/ this function returned @'Ratio'
-- 'Data.Numeric.Natural'@.

damerauLevenshteinNorm :: Text -> Text -> Ratio Int
damerauLevenshteinNorm :: Text -> Text -> Ratio Int
damerauLevenshteinNorm = (Text -> Text -> (Int, Int)) -> Text -> Text -> Ratio Int
norm Text -> Text -> (Int, Int)
damerauLevenshtein_

-- | An internal helper, returns Damerau-Levenshtein distance as the first
-- element of the tuple and max length of the two inputs as the second
-- element of the tuple.

damerauLevenshtein_ :: Text -> Text -> (Int, Int)
damerauLevenshtein_ :: Text -> Text -> (Int, Int)
damerauLevenshtein_ a :: Text
a b :: Text
b
  | Text -> Bool
T.null Text
a = (Int
lenb, Int
lenm)
  | Text -> Bool
T.null Text
b = (Int
lena, Int
lenm)
  | Bool
otherwise = (forall s. ST s (Int, Int)) -> (Int, Int)
forall a. (forall s. ST s a) -> a
runST ((forall s. ST s (Int, Int)) -> (Int, Int))
-> (forall s. ST s (Int, Int)) -> (Int, Int)
forall a b. (a -> b) -> a -> b
$ do
      let v_len :: Int
v_len = Int
lenb Int -> Int -> Int
forall a. Num a => a -> a -> a
+ 1
      MVector s Int
v <- Int -> ST s (MVector (PrimState (ST s)) Int)
forall (m :: * -> *) a.
(PrimMonad m, Unbox a) =>
Int -> m (MVector (PrimState m) a)
VUM.unsafeNew (Int
v_len Int -> Int -> Int
forall a. Num a => a -> a -> a
* 3)
      let gov :: Int -> ST s ()
gov !Int
i =
            Bool -> ST s () -> ST s ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when (Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
v_len) (ST s () -> ST s ()) -> ST s () -> ST s ()
forall a b. (a -> b) -> a -> b
$ do
              MVector (PrimState (ST s)) Int -> Int -> Int -> ST s ()
forall (m :: * -> *) a.
(PrimMonad m, Unbox a) =>
MVector (PrimState m) a -> Int -> a -> m ()
VUM.unsafeWrite MVector s Int
MVector (PrimState (ST s)) Int
v Int
i Int
i
              Int -> ST s ()
gov (Int
i Int -> Int -> Int
forall a. Num a => a -> a -> a
+ 1)
          goi :: Int -> Int -> Char -> Int -> Int -> Int -> ST s ()
goi !Int
i !Int
na !Char
ai_1 !Int
v0 !Int
v1 !Int
v2 = do
            let !(TU.Iter ai :: Char
ai da :: Int
da) = Text -> Int -> Iter
TU.iter Text
a Int
na
                goj :: Int -> Int -> Char -> ST s ()
goj !Int
j !Int
nb !Char
bj_1 =
                  Bool -> ST s () -> ST s ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when (Int
j Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
lenb) (ST s () -> ST s ()) -> ST s () -> ST s ()
forall a b. (a -> b) -> a -> b
$ do
                    let !(TU.Iter bj :: Char
bj db :: Int
db) = Text -> Int -> Iter
TU.iter Text
b Int
nb
                        cost :: Int
cost = if Char
ai Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
bj then 0 else 1
                    Int
x <- (Int -> Int -> Int
forall a. Num a => a -> a -> a
+ 1) (Int -> Int) -> ST s Int -> ST s Int
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> MVector (PrimState (ST s)) Int -> Int -> ST s Int
forall (m :: * -> *) a.
(PrimMonad m, Unbox a) =>
MVector (PrimState m) a -> Int -> m a
VUM.unsafeRead MVector s Int
MVector (PrimState (ST s)) Int
v (Int
v1 Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
j)
                    Int
y <- (Int -> Int -> Int
forall a. Num a => a -> a -> a
+ 1) (Int -> Int) -> ST s Int -> ST s Int
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> MVector (PrimState (ST s)) Int -> Int -> ST s Int
forall (m :: * -> *) a.
(PrimMonad m, Unbox a) =>
MVector (PrimState m) a -> Int -> m a
VUM.unsafeRead MVector s Int
MVector (PrimState (ST s)) Int
v (Int
v0 Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
j Int -> Int -> Int
forall a. Num a => a -> a -> a
+ 1)
                    Int
z <- (Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
cost) (Int -> Int) -> ST s Int -> ST s Int
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> MVector (PrimState (ST s)) Int -> Int -> ST s Int
forall (m :: * -> *) a.
(PrimMonad m, Unbox a) =>
MVector (PrimState m) a -> Int -> m a
VUM.unsafeRead MVector s Int
MVector (PrimState (ST s)) Int
v (Int
v0 Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
j)
                    let g :: Int
g = Int -> Int -> Int
forall a. Ord a => a -> a -> a
min Int
x (Int -> Int -> Int
forall a. Ord a => a -> a -> a
min Int
y Int
z)
                    Int
val <- (Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
cost) (Int -> Int) -> ST s Int -> ST s Int
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> MVector (PrimState (ST s)) Int -> Int -> ST s Int
forall (m :: * -> *) a.
(PrimMonad m, Unbox a) =>
MVector (PrimState m) a -> Int -> m a
VUM.unsafeRead MVector s Int
MVector (PrimState (ST s)) Int
v (Int
v2 Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
j Int -> Int -> Int
forall a. Num a => a -> a -> a
- 1)
                    MVector (PrimState (ST s)) Int -> Int -> Int -> ST s ()
forall (m :: * -> *) a.
(PrimMonad m, Unbox a) =>
MVector (PrimState m) a -> Int -> a -> m ()
VUM.unsafeWrite MVector s Int
MVector (PrimState (ST s)) Int
v (Int
v1 Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
j Int -> Int -> Int
forall a. Num a => a -> a -> a
+ 1) (Int -> ST s ()) -> Int -> ST s ()
forall a b. (a -> b) -> a -> b
$
                      if Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> 0 Bool -> Bool -> Bool
&& Int
j Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> 0 Bool -> Bool -> Bool
&& Char
ai Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
bj_1 Bool -> Bool -> Bool
&& Char
ai_1 Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
bj Bool -> Bool -> Bool
&& Int
val Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
g
                        then Int
val
                        else Int
g
                    Int -> Int -> Char -> ST s ()
goj (Int
j Int -> Int -> Int
forall a. Num a => a -> a -> a
+ 1) (Int
nb Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
db) Char
bj
            Bool -> ST s () -> ST s ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when (Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
lena) (ST s () -> ST s ()) -> ST s () -> ST s ()
forall a b. (a -> b) -> a -> b
$ do
              MVector (PrimState (ST s)) Int -> Int -> Int -> ST s ()
forall (m :: * -> *) a.
(PrimMonad m, Unbox a) =>
MVector (PrimState m) a -> Int -> a -> m ()
VUM.unsafeWrite MVector s Int
MVector (PrimState (ST s)) Int
v Int
v1 (Int
i Int -> Int -> Int
forall a. Num a => a -> a -> a
+ 1)
              Int -> Int -> Char -> ST s ()
goj 0 0 'a'
              Int -> Int -> Char -> Int -> Int -> Int -> ST s ()
goi (Int
i Int -> Int -> Int
forall a. Num a => a -> a -> a
+ 1) (Int
na Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
da) Char
ai Int
v1 Int
v2 Int
v0
      Int -> ST s ()
gov 0
      Int -> Int -> Char -> Int -> Int -> Int -> ST s ()
goi 0 0 'a' 0 Int
v_len (Int
v_len Int -> Int -> Int
forall a. Num a => a -> a -> a
* 2)
      Int
ld <- MVector (PrimState (ST s)) Int -> Int -> ST s Int
forall (m :: * -> *) a.
(PrimMonad m, Unbox a) =>
MVector (PrimState m) a -> Int -> m a
VUM.unsafeRead MVector s Int
MVector (PrimState (ST s)) Int
v (Int
lenb Int -> Int -> Int
forall a. Num a => a -> a -> a
+ (Int
lena Int -> Int -> Int
forall a. Integral a => a -> a -> a
`mod` 3) Int -> Int -> Int
forall a. Num a => a -> a -> a
* Int
v_len)
      (Int, Int) -> ST s (Int, Int)
forall (m :: * -> *) a. Monad m => a -> m a
return (Int
ld, Int
lenm)
  where
    lena :: Int
lena = Text -> Int
T.length Text
a
    lenb :: Int
lenb = Text -> Int
T.length Text
b
    lenm :: Int
lenm = Int -> Int -> Int
forall a. Ord a => a -> a -> a
max Int
lena Int
lenb
{-# INLINE damerauLevenshtein_ #-}

----------------------------------------------------------------------------
-- Treating inputs like sets

-- | Return overlap coefficient for two 'Text' values. Returned value is in
-- the range from 0 (no similarity) to 1 (exact match). Return 1 if both
-- 'Text' values are empty.
--
-- See also: <https://en.wikipedia.org/wiki/Overlap_coefficient>.
--
-- @since 0.3.0

overlap :: Text -> Text -> Ratio Int
overlap :: Text -> Text -> Ratio Int
overlap a :: Text
a b :: Text
b =
  if Int
d Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== 0
    then 1 Int -> Int -> Ratio Int
forall a. Integral a => a -> a -> Ratio a
% 1
    else Map Char Int -> Map Char Int -> Int
intersectionSize (Text -> Map Char Int
mkTextMap Text
a) (Text -> Map Char Int
mkTextMap Text
b) Int -> Int -> Ratio Int
forall a. Integral a => a -> a -> Ratio a
% Int
d
  where
    d :: Int
d = Int -> Int -> Int
forall a. Ord a => a -> a -> a
min (Text -> Int
T.length Text
a) (Text -> Int
T.length Text
b)

-- | Return Jaccard similarity coefficient for two 'Text' values. Returned
-- value is in the range from 0 (no similarity) to 1 (exact match). Return 1
-- if both
--
-- See also: <https://en.wikipedia.org/wiki/Jaccard_index>
--
-- @since 0.3.0

jaccard :: Text -> Text -> Ratio Int
jaccard :: Text -> Text -> Ratio Int
jaccard a :: Text
a b :: Text
b =
  if Int
d Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== 0
    then 1 Int -> Int -> Ratio Int
forall a. Integral a => a -> a -> Ratio a
% 1
    else Map Char Int -> Map Char Int -> Int
intersectionSize Map Char Int
ma Map Char Int
mb Int -> Int -> Ratio Int
forall a. Integral a => a -> a -> Ratio a
% Int
d
  where
    ma :: Map Char Int
ma = Text -> Map Char Int
mkTextMap Text
a
    mb :: Map Char Int
mb = Text -> Map Char Int
mkTextMap Text
b
    d :: Int
d  = Map Char Int -> Map Char Int -> Int
unionSize Map Char Int
ma Map Char Int
mb

-- | Make a map from 'Char' to 'Int' representing how many times the 'Char'
-- appears in the input 'Text'.

mkTextMap :: Text -> Map Char Int
mkTextMap :: Text -> Map Char Int
mkTextMap = (Map Char Int -> Char -> Map Char Int)
-> Map Char Int -> Text -> Map Char Int
forall a. (a -> Char -> a) -> a -> Text -> a
T.foldl' Map Char Int -> Char -> Map Char Int
forall k a. (Ord k, Num a) => Map k a -> k -> Map k a
f Map Char Int
forall k a. Map k a
M.empty
  where
    f :: Map k a -> k -> Map k a
f m :: Map k a
m ch :: k
ch = (a -> a -> a) -> k -> a -> Map k a -> Map k a
forall k a. Ord k => (a -> a -> a) -> k -> a -> Map k a -> Map k a
M.insertWith a -> a -> a
forall a. Num a => a -> a -> a
(+) k
ch 1 Map k a
m
{-# INLINE mkTextMap #-}

-- | Return intersection size between two 'Text'-maps.

intersectionSize :: Map Char Int -> Map Char Int -> Int
intersectionSize :: Map Char Int -> Map Char Int -> Int
intersectionSize a :: Map Char Int
a b :: Map Char Int
b = (Int -> Int -> Int) -> Int -> Map Char Int -> Int
forall a b k. (a -> b -> a) -> a -> Map k b -> a
M.foldl' Int -> Int -> Int
forall a. Num a => a -> a -> a
(+) 0 ((Int -> Int -> Int) -> Map Char Int -> Map Char Int -> Map Char Int
forall k a b c.
Ord k =>
(a -> b -> c) -> Map k a -> Map k b -> Map k c
M.intersectionWith Int -> Int -> Int
forall a. Ord a => a -> a -> a
min Map Char Int
a Map Char Int
b)
{-# INLINE intersectionSize #-}

-- | Return union size between two 'Text'-maps.

unionSize :: Map Char Int -> Map Char Int -> Int
unionSize :: Map Char Int -> Map Char Int -> Int
unionSize a :: Map Char Int
a b :: Map Char Int
b = (Int -> Int -> Int) -> Int -> Map Char Int -> Int
forall a b k. (a -> b -> a) -> a -> Map k b -> a
M.foldl' Int -> Int -> Int
forall a. Num a => a -> a -> a
(+) 0 ((Int -> Int -> Int) -> Map Char Int -> Map Char Int -> Map Char Int
forall k a. Ord k => (a -> a -> a) -> Map k a -> Map k a -> Map k a
M.unionWith Int -> Int -> Int
forall a. Ord a => a -> a -> a
max Map Char Int
a Map Char Int
b)
{-# INLINE unionSize #-}

----------------------------------------------------------------------------
-- Other

-- | /O(n)/ Return Hamming distance between two 'Text' values. Hamming
-- distance is defined as the number of positions at which the corresponding
-- symbols are different. The input 'Text' values should be of equal length
-- or 'Nothing' will be returned.
--
-- See also: <https://en.wikipedia.org/wiki/Hamming_distance>.
--
-- __Heads up__, before version /0.3.0/ this function returned @'Maybe'
-- 'Data.Numeric.Natural'@.

hamming :: Text -> Text -> Maybe Int
hamming :: Text -> Text -> Maybe Int
hamming a :: Text
a b :: Text
b =
  if Text -> Int
T.length Text
a Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Text -> Int
T.length Text
b
    then Int -> Maybe Int
forall a. a -> Maybe a
Just (Int -> Int -> Int -> Int
forall t. Num t => Int -> Int -> t -> t
go 0 0 0)
    else Maybe Int
forall a. Maybe a
Nothing
  where
    go :: Int -> Int -> t -> t
go !Int
na !Int
nb !t
r =
      let !(TU.Iter cha :: Char
cha da :: Int
da) = Text -> Int -> Iter
TU.iter Text
a Int
na
          !(TU.Iter chb :: Char
chb db :: Int
db) = Text -> Int -> Iter
TU.iter Text
b Int
nb
      in if | Int
na  Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
len -> t
r
            | Char
cha Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
/= Char
chb -> Int -> Int -> t -> t
go (Int
na Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
da) (Int
nb Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
db) (t
r t -> t -> t
forall a. Num a => a -> a -> a
+ 1)
            | Bool
otherwise  -> Int -> Int -> t -> t
go (Int
na Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
da) (Int
nb Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
db) t
r
    len :: Int
len = Text -> Int
TU.lengthWord16 Text
a

-- | Return Jaro distance between two 'Text' values. Returned value is in
-- the range from 0 (no similarity) to 1 (exact match).
--
-- While the algorithm is pretty clear for artificial examples (like those
-- from the linked Wikipedia article), for /arbitrary/ strings, it may be
-- hard to decide which of two strings should be considered as one having
-- “reference” order of characters (order of matching characters in an
-- essential part of the definition of the algorithm). This makes us
-- consider the first string the “reference” string (with correct order of
-- characters). Thus generally,
--
-- > jaro a b ≠ jaro b a
--
-- This asymmetry can be found in all implementations of the algorithm on
-- the internet, AFAIK.
--
-- See also: <https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance>
--
-- @since 0.2.0
--
-- __Heads up__, before version /0.3.0/ this function returned @'Ratio'
-- 'Data.Numeric.Natural'@.

jaro :: Text -> Text -> Ratio Int
jaro :: Text -> Text -> Ratio Int
jaro a :: Text
a b :: Text
b =
  if Text -> Bool
T.null Text
a Bool -> Bool -> Bool
|| Text -> Bool
T.null Text
b
    then 0 Int -> Int -> Ratio Int
forall a. Integral a => a -> a -> Ratio a
% 1
    else (forall s. ST s (Ratio Int)) -> Ratio Int
forall a. (forall s. ST s a) -> a
runST ((forall s. ST s (Ratio Int)) -> Ratio Int)
-> (forall s. ST s (Ratio Int)) -> Ratio Int
forall a b. (a -> b) -> a -> b
$ do
      let lena :: Int
lena = Text -> Int
T.length Text
a
          lenb :: Int
lenb = Text -> Int
T.length Text
b
          d :: Int
d =
            if Int
lena Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= 2 Bool -> Bool -> Bool
&& Int
lenb Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= 2
              then Int -> Int -> Int
forall a. Ord a => a -> a -> a
max Int
lena Int
lenb Int -> Int -> Int
forall a. Integral a => a -> a -> a
`quot` 2 Int -> Int -> Int
forall a. Num a => a -> a -> a
- 1
              else 0
      MVector s Int
v <- Int -> Int -> ST s (MVector (PrimState (ST s)) Int)
forall (m :: * -> *) a.
(PrimMonad m, Unbox a) =>
Int -> a -> m (MVector (PrimState m) a)
VUM.replicate Int
lenb (0 :: Int)
      MVector s Int
r <- Int -> Int -> ST s (MVector (PrimState (ST s)) Int)
forall (m :: * -> *) a.
(PrimMonad m, Unbox a) =>
Int -> a -> m (MVector (PrimState m) a)
VUM.replicate 3 (0 :: Int) -- tj, m, t
      let goi :: Int -> Int -> Int -> ST s ()
goi !Int
i !Int
na !Int
fromb = do
            let !(TU.Iter ai :: Char
ai da :: Int
da) = Text -> Int -> Iter
TU.iter Text
a Int
na
                (from :: Int
from, fromb' :: Int
fromb') =
                  if Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
d
                    then (Int
i Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
d, Int
fromb Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Text -> Int -> Int
TU.iter_ Text
b Int
fromb)
                    else (0, 0)
                to :: Int
to = Int -> Int -> Int
forall a. Ord a => a -> a -> a
min (Int
i Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
d Int -> Int -> Int
forall a. Num a => a -> a -> a
+ 1) Int
lenb
                goj :: Int -> Int -> ST s ()
goj !Int
j !Int
nb =
                  Bool -> ST s () -> ST s ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when (Int
j Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
to) (ST s () -> ST s ()) -> ST s () -> ST s ()
forall a b. (a -> b) -> a -> b
$ do
                    let !(TU.Iter bj :: Char
bj db :: Int
db) = Text -> Int -> Iter
TU.iter Text
b Int
nb
                    Bool
used <- (Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== 1) (Int -> Bool) -> ST s Int -> ST s Bool
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> MVector (PrimState (ST s)) Int -> Int -> ST s Int
forall (m :: * -> *) a.
(PrimMonad m, Unbox a) =>
MVector (PrimState m) a -> Int -> m a
VUM.unsafeRead MVector s Int
MVector (PrimState (ST s)) Int
v Int
j
                    if Bool -> Bool
not Bool
used Bool -> Bool -> Bool
&& Char
ai Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
bj
                      then do
                        Int
tj <- MVector (PrimState (ST s)) Int -> Int -> ST s Int
forall (m :: * -> *) a.
(PrimMonad m, Unbox a) =>
MVector (PrimState m) a -> Int -> m a
VUM.unsafeRead MVector s Int
MVector (PrimState (ST s)) Int
r 0
                        if Int
j Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
tj
                          then MVector (PrimState (ST s)) Int -> (Int -> Int) -> Int -> ST s ()
forall (m :: * -> *) a.
(PrimMonad m, Unbox a) =>
MVector (PrimState m) a -> (a -> a) -> Int -> m ()
VUM.unsafeModify MVector s Int
MVector (PrimState (ST s)) Int
r (Int -> Int -> Int
forall a. Num a => a -> a -> a
+ 1) 2
                          else MVector (PrimState (ST s)) Int -> Int -> Int -> ST s ()
forall (m :: * -> *) a.
(PrimMonad m, Unbox a) =>
MVector (PrimState m) a -> Int -> a -> m ()
VUM.unsafeWrite  MVector s Int
MVector (PrimState (ST s)) Int
r 0 Int
j
                        MVector (PrimState (ST s)) Int -> Int -> Int -> ST s ()
forall (m :: * -> *) a.
(PrimMonad m, Unbox a) =>
MVector (PrimState m) a -> Int -> a -> m ()
VUM.unsafeWrite MVector s Int
MVector (PrimState (ST s)) Int
v Int
j 1
                        MVector (PrimState (ST s)) Int -> (Int -> Int) -> Int -> ST s ()
forall (m :: * -> *) a.
(PrimMonad m, Unbox a) =>
MVector (PrimState m) a -> (a -> a) -> Int -> m ()
VUM.unsafeModify MVector s Int
MVector (PrimState (ST s)) Int
r (Int -> Int -> Int
forall a. Num a => a -> a -> a
+ 1) 1
                      else Int -> Int -> ST s ()
goj (Int
j Int -> Int -> Int
forall a. Num a => a -> a -> a
+ 1) (Int
nb Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
db)
            Bool -> ST s () -> ST s ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when (Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
lena) (ST s () -> ST s ()) -> ST s () -> ST s ()
forall a b. (a -> b) -> a -> b
$ do
              Int -> Int -> ST s ()
goj Int
from Int
fromb
              Int -> Int -> Int -> ST s ()
goi (Int
i Int -> Int -> Int
forall a. Num a => a -> a -> a
+ 1) (Int
na Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
da) Int
fromb'
      Int -> Int -> Int -> ST s ()
goi 0 0 0
      Int
m <- MVector (PrimState (ST s)) Int -> Int -> ST s Int
forall (m :: * -> *) a.
(PrimMonad m, Unbox a) =>
MVector (PrimState m) a -> Int -> m a
VUM.unsafeRead MVector s Int
MVector (PrimState (ST s)) Int
r 1
      Int
t <- MVector (PrimState (ST s)) Int -> Int -> ST s Int
forall (m :: * -> *) a.
(PrimMonad m, Unbox a) =>
MVector (PrimState m) a -> Int -> m a
VUM.unsafeRead MVector s Int
MVector (PrimState (ST s)) Int
r 2
      Ratio Int -> ST s (Ratio Int)
forall (m :: * -> *) a. Monad m => a -> m a
return (Ratio Int -> ST s (Ratio Int)) -> Ratio Int -> ST s (Ratio Int)
forall a b. (a -> b) -> a -> b
$
        if Int
m Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== 0
          then 0 Int -> Int -> Ratio Int
forall a. Integral a => a -> a -> Ratio a
% 1
          else ((Int
m Int -> Int -> Ratio Int
forall a. Integral a => a -> a -> Ratio a
% Int
lena) Ratio Int -> Ratio Int -> Ratio Int
forall a. Num a => a -> a -> a
+
                (Int
m Int -> Int -> Ratio Int
forall a. Integral a => a -> a -> Ratio a
% Int
lenb) Ratio Int -> Ratio Int -> Ratio Int
forall a. Num a => a -> a -> a
+
                ((Int
m Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
t) Int -> Int -> Ratio Int
forall a. Integral a => a -> a -> Ratio a
% Int
m)) Ratio Int -> Ratio Int -> Ratio Int
forall a. Fractional a => a -> a -> a
/ 3

-- | Return Jaro-Winkler distance between two 'Text' values. Returned value
-- is in range from 0 (no similarity) to 1 (exact match).
--
-- See also: <https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance>
--
-- @since 0.2.0
--
-- __Heads up__, before version /0.3.0/ this function returned @'Ratio'
-- 'Data.Numeric.Natural'@.

jaroWinkler :: Text -> Text -> Ratio Int
jaroWinkler :: Text -> Text -> Ratio Int
jaroWinkler a :: Text
a b :: Text
b = Ratio Int
dj Ratio Int -> Ratio Int -> Ratio Int
forall a. Num a => a -> a -> a
+ (1 Int -> Int -> Ratio Int
forall a. Integral a => a -> a -> Ratio a
% 10) Ratio Int -> Ratio Int -> Ratio Int
forall a. Num a => a -> a -> a
* Ratio Int
l Ratio Int -> Ratio Int -> Ratio Int
forall a. Num a => a -> a -> a
* (1 Ratio Int -> Ratio Int -> Ratio Int
forall a. Num a => a -> a -> a
- Ratio Int
dj)
  where
    dj :: Ratio Int
dj = Ratio Int -> Ratio Int
forall a. a -> a
inline (Text -> Text -> Ratio Int
jaro Text
a Text
b)
    l :: Ratio Int
l  = Int -> Ratio Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Text -> Text -> Int
commonPrefix Text
a Text
b)

-- | Return length of common prefix two 'Text' values have.

commonPrefix :: Text -> Text -> Int
commonPrefix :: Text -> Text -> Int
commonPrefix a :: Text
a b :: Text
b = Int -> Int -> Int -> Int
forall t. Num t => Int -> Int -> t -> t
go 0 0 0
  where
    go :: Int -> Int -> t -> t
go !Int
na !Int
nb !t
r =
      let !(TU.Iter cha :: Char
cha da :: Int
da) = Text -> Int -> Iter
TU.iter Text
a Int
na
          !(TU.Iter chb :: Char
chb db :: Int
db) = Text -> Int -> Iter
TU.iter Text
b Int
nb
      in if | Int
na Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
lena -> t
r
            | Int
nb Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
lenb -> t
r
            | Char
cha Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
chb -> Int -> Int -> t -> t
go (Int
na Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
da) (Int
nb Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
db) (t
r t -> t -> t
forall a. Num a => a -> a -> a
+ 1)
            | Bool
otherwise  -> t
r
    lena :: Int
lena = Text -> Int
TU.lengthWord16 Text
a
    lenb :: Int
lenb = Text -> Int
TU.lengthWord16 Text
b
{-# INLINE commonPrefix #-}

----------------------------------------------------------------------------
-- Helpers

norm :: (Text -> Text -> (Int, Int)) -> Text -> Text -> Ratio Int
norm :: (Text -> Text -> (Int, Int)) -> Text -> Text -> Ratio Int
norm f :: Text -> Text -> (Int, Int)
f a :: Text
a b :: Text
b =
  let (r :: Int
r, l :: Int
l) = Text -> Text -> (Int, Int)
f Text
a Text
b
  in if Int
r Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== 0
       then 1 Int -> Int -> Ratio Int
forall a. Integral a => a -> a -> Ratio a
% 1
       else 1 Int -> Int -> Ratio Int
forall a. Integral a => a -> a -> Ratio a
% 1 Ratio Int -> Ratio Int -> Ratio Int
forall a. Num a => a -> a -> a
- Int
r Int -> Int -> Ratio Int
forall a. Integral a => a -> a -> Ratio a
% Int
l
{-# INLINE norm #-}