From 738a83349bcbd1e318b3e6d2c5701bffcddebafc Mon Sep 17 00:00:00 2001 From: Jana Chadt Date: Thu, 22 Jun 2023 12:23:18 +0200 Subject: [PATCH] Implement matching priority for licenses Based on usage statistics extracted from Flora.pm database dump on 22.06.23 --- ghcide/src/Text/Fuzzy/Parallel.hs | 38 ++++++++- .../src/Ide/Plugin/Cabal/Completions.hs | 79 ++++++++++++++++++- 2 files changed, 112 insertions(+), 5 deletions(-) diff --git a/ghcide/src/Text/Fuzzy/Parallel.hs b/ghcide/src/Text/Fuzzy/Parallel.hs index 0137861468..5f7ddc56db 100644 --- a/ghcide/src/Text/Fuzzy/Parallel.hs +++ b/ghcide/src/Text/Fuzzy/Parallel.hs @@ -1,8 +1,8 @@ -- | Parallel versions of 'filter' and 'simpleFilter' module Text.Fuzzy.Parallel -( filter, - simpleFilter, +( filter, filter', + simpleFilter, simpleFilter', match, Scored(..) ) where @@ -102,6 +102,40 @@ simpleFilter :: Int -- ^ Chunk size. 1000 works well. simpleFilter chunk maxRes pattern xs = filter chunk maxRes pattern xs id + +-- | The function to filter a list of values by fuzzy search on the text extracted from them. +filter' :: Int -- ^ Chunk size. 1000 works well. + -> Int -- ^ Max. number of results wanted + -> T.Text -- ^ Pattern. + -> [t] -- ^ The list of values containing the text to search in. + -> (t -> T.Text) -- ^ The function to extract the text from the container. + -> (T.Text -> T.Text -> Maybe Int) -- ^ Function to use for matching + -> [Scored t] -- ^ The list of results, sorted, highest score first. +filter' chunkSize maxRes pattern ts extract match' = partialSortByAscScore maxRes perfectScore (concat vss) + where + -- Preserve case for the first character, make all others lowercase + pattern' = case T.uncons pattern of + Just (c, rest) -> T.cons c (T.toLower rest) + _ -> pattern + vss = map (mapMaybe (\t -> flip Scored t <$> match' pattern' (extract t))) (chunkList chunkSize ts) + `using` parList (evalList rseq) + perfectScore = fromMaybe (error $ T.unpack pattern) $ match' pattern' pattern' + +-- | Return all elements of the list that have a fuzzy +-- match against the pattern, using a custom match function. Runs with default settings where +-- nothing is added around the matches, as case insensitive. +-- +-- >>> simpleFilter 1000 10 "vm" ["vim", "emacs", "virtual machine"] +-- [Scored {score = 4, original = "vim"},Scored {score = 4, original = "virtual machine"}] +{-# INLINABLE simpleFilter' #-} +simpleFilter' :: Int -- ^ Chunk size. 1000 works well. + -> Int -- ^ Max. number of results wanted + -> T.Text -- ^ Pattern to look for. + -> [T.Text] -- ^ List of texts to check. + -> (T.Text -> T.Text -> Maybe Int) -- ^ Function to use for matching + -> [Scored T.Text] -- ^ The ones that match. +simpleFilter' chunk maxRes pattern xs match' = + filter' chunk maxRes pattern xs id match' -------------------------------------------------------------------------------- chunkList :: Int -> [a] -> [[a]] diff --git a/plugins/hls-cabal-plugin/src/Ide/Plugin/Cabal/Completions.hs b/plugins/hls-cabal-plugin/src/Ide/Plugin/Cabal/Completions.hs index df152251c8..7cf03ff724 100644 --- a/plugins/hls-cabal-plugin/src/Ide/Plugin/Cabal/Completions.hs +++ b/plugins/hls-cabal-plugin/src/Ide/Plugin/Cabal/Completions.hs @@ -13,6 +13,7 @@ import Data.Maybe (fromMaybe) import qualified Data.Text as T import Data.Text.Utf16.Rope (Rope) import qualified Data.Text.Utf16.Rope as Rope +import Debug.Trace (traceShowM) import Development.IDE as D import Distribution.CabalSpecVersion (CabalSpecVersion (CabalSpecV2_2), showCabalSpecVersion) @@ -25,6 +26,7 @@ import qualified Language.LSP.Protocol.Types as Compls (CompletionItem import qualified Language.LSP.Protocol.Types as LSP import qualified Language.LSP.VFS as VFS import qualified Text.Fuzzy.Parallel as Fuzzy +import Data.Ord (Down(..)) -- ---------------------------------------------------------------- -- Public API for Completions @@ -265,8 +267,29 @@ constantCompleter completions _ ctxInfo = do let range = completionRange ctxInfo pure $ map (makeSimpleCabalCompletionItem range . Fuzzy.original) scored +weightedConstantCompleter :: [T.Text] -> Map T.Text Double -> Completer +weightedConstantCompleter completions weights _ ctxInfo = do + let scored = if perfectScore > 0 + then fmap Fuzzy.original $ Fuzzy.simpleFilter' 1000 10 prefix completions customMatch + else topTenByWeight + let range = completionRange ctxInfo + pure $ map (makeSimpleCabalCompletionItem range) scored + where + prefix = completionPrefix ctxInfo + perfectScore = fromMaybe (error "match is broken") $ Fuzzy.match prefix prefix + customMatch :: (T.Text -> T.Text -> Maybe Int) + customMatch toSearch searchSpace = do + matched <- Fuzzy.match toSearch searchSpace + let weight = fromMaybe 0 $ Map.lookup searchSpace weights + let score = min + perfectScore + (round (fromIntegral matched * (1 + weight))) + pure score + topTenByWeight :: [T.Text] + topTenByWeight = take 10 $ map fst $ List.sortOn (Down . snd) $ Map.assocs weights + {- | Completer to be used when a file path can be - completed for a field, takes the file path of the directory to start from. + completed for a field, takes the file path of the directory to start from. Completes file paths as well as directories. -} filePathCompleter :: Completer @@ -327,7 +350,7 @@ cabalKeywords = [ ("name:", noopCompleter) -- TODO: should complete to filename, needs meta info , ("version:", noopCompleter) , ("build-type:", constantCompleter ["Simple", "Custom", "Configure", "Make"]) - , ("license:", constantCompleter licenseNames) + , ("license:", weightedConstantCompleter licenseNames weightedLicenseNames) , ("license-file:", filePathCompleter) , ("license-files:", filePathCompleter) -- list of filenames , ("copyright:", noopCompleter) @@ -411,7 +434,7 @@ stanzaKeywordMap = ) , ( "source-repository" - , Map.fromList $ + , Map.fromList [ ( "type:" , constantCompleter @@ -476,6 +499,56 @@ stanzaKeywordMap = , ("mixins:", noopCompleter) ] +weightedLicenseNames :: Map T.Text Double +weightedLicenseNames = fmap statisticsToWeight $ Map.fromList + [("BSD-3-Clause",9955) + , ("MIT",3336) + , ("GPL-3.0-only",679) + , ("LicenseRef-OtherLicense",521) + , ("Apache-2.0",514) + , ("LicenseRef-GPL",443) + , ("LicenseRef-PublicDomain",318) + , ("MPL-2.0",288) + , ("BSD-2-Clause",174) + , ("GPL-2.0-only",160) + , ("LicenseRef-LGPL",146) + , ("LGPL-2.1-only",112) + , ("LGPL-3.0-only",100) + , ("AGPL-3.0-only",96) + , ("ISC",89) + , ("LicenseRef-Apache",45) + , ("GPL-3.0-or-later",43) + , ("BSD-2-Clause-Patent",33) + , ("GPL-2.0-or-later",21) + , ("CC0-1.0",16) + , ("AGPL-3.0-or-later",15) + , ("LGPL-2.1-or-later",12) + , ("(BSD-2-Clause OR Apache-2.0)",10) + , ("(Apache-2.0 OR MPL-2.0)",8) + , ("LicenseRef-AGPL",6) + , ("(BSD-3-Clause OR Apache-2.0)",4) + , ("0BSD",3) + , ("BSD-4-Clause",3) + , ("LGPL-3.0-or-later",3) + , ("LicenseRef-LGPL-2",2) + , ("GPL-2.0-or-later AND BSD-3-Clause",2) + , ("NONE",2) + , ("Zlib",2) + , ("(Apache-2.0 OR BSD-3-Clause)",2) + , ("BSD-3-Clause AND GPL-2.0-or-later",2) + , ("BSD-3-Clause AND GPL-3.0-or-later",2) + ] + where + statisticsToWeight :: Int -> Double + statisticsToWeight stat + | stat < 10 = 0.1 + | stat < 20 = 0.3 + | stat < 50 = 0.4 + | stat < 100 = 0.5 + | stat < 500 = 0.6 + | stat < 650 = 0.7 + | otherwise = 0.9 + -- cabalFlagKeywords :: [(T.Text, T.Text)] -- cabalFlagKeywords = -- [