From 44f21e50cb5939da7af61860bcff1861ab54daff Mon Sep 17 00:00:00 2001 From: Yuri Romanowski Date: Tue, 13 Dec 2022 13:49:52 +0500 Subject: [PATCH] [#64] Implement copy/paste protection checks Problem: Currently xrefcheck is not able to detect possible bad copy-pastes, when some links are referring the same file, but from the link names it seems that one of those links should refer other file. Solution: Implement check, add corresponding settings to the config. --- src/Xrefcheck/Command.hs | 5 +- src/Xrefcheck/Config.hs | 5 + src/Xrefcheck/Config/Default.hs | 3 + src/Xrefcheck/Core.hs | 4 +- src/Xrefcheck/Verify.hs | 102 +++++++++++++++--- tests/Test/Xrefcheck/CopyPasteCheckSpec.hs | 65 +++++++++++ tests/Test/Xrefcheck/IgnoreRegexSpec.hs | 2 +- tests/configs/github-config.yaml | 3 + .../check-copy-paste/check-copy-paste.bats | 17 +++ tests/golden/check-copy-paste/expected.gold | 48 +++++++++ tests/golden/check-copy-paste/first-file.md | 11 ++ tests/golden/check-copy-paste/second-file.md | 29 +++++ 12 files changed, 277 insertions(+), 17 deletions(-) create mode 100644 tests/Test/Xrefcheck/CopyPasteCheckSpec.hs create mode 100644 tests/golden/check-copy-paste/check-copy-paste.bats create mode 100644 tests/golden/check-copy-paste/expected.gold create mode 100644 tests/golden/check-copy-paste/first-file.md create mode 100644 tests/golden/check-copy-paste/second-file.md diff --git a/src/Xrefcheck/Command.hs b/src/Xrefcheck/Command.hs index 16620a86..12ee272e 100644 --- a/src/Xrefcheck/Command.hs +++ b/src/Xrefcheck/Command.hs @@ -28,7 +28,7 @@ import Xrefcheck.Scan import Xrefcheck.Scanners.Markdown (markdownSupport) import Xrefcheck.System (askWithinCI) import Xrefcheck.Util -import Xrefcheck.Verify (reportVerifyErrs, verifyErrors, verifyRepo) +import Xrefcheck.Verify (reportCopyPasteErrors, reportVerifyErrs, verifyErrors, verifyRepo) readConfig :: FilePath -> IO Config readConfig path = fmap (normaliseConfigFilePaths . overrideConfig) do @@ -81,11 +81,12 @@ defaultAction Options{..} = do whenJust (nonEmpty $ sortBy (compare `on` seFile) scanErrs) $ reportScanErrs - verifyRes <- allowRewrite showProgressBar $ \rw -> do + (verifyRes, copyPasteErrors) <- allowRewrite showProgressBar $ \rw -> do let fullConfig = config { cNetworking = addNetworkingOptions (cNetworking config) oNetworkingOptions } verifyRepo rw fullConfig oMode oRoot repoInfo + whenJust (nonEmpty copyPasteErrors) reportCopyPasteErrors case verifyErrors verifyRes of Nothing | null scanErrs -> fmtLn "All repository links are valid." Nothing -> exitFailure diff --git a/src/Xrefcheck/Config.hs b/src/Xrefcheck/Config.hs index 1496167b..a0d2ff4b 100644 --- a/src/Xrefcheck/Config.hs +++ b/src/Xrefcheck/Config.hs @@ -73,6 +73,8 @@ data ScannersConfig' f = ScannersConfig , scAnchorSimilarityThreshold :: Field f Double -- ^ On 'anchor not found' error, how much similar anchors should be displayed as -- hint. Number should be between 0 and 1, larger value means stricter filter. + , scCopyPasteCheckEnabled :: Field f Bool + -- ^ Whether copy-paste check is enabled globally. } deriving stock (Generic) makeLensesWith postfixFields ''Config' @@ -94,6 +96,9 @@ overrideConfig config , scAnchorSimilarityThreshold = fromMaybe (scAnchorSimilarityThreshold defScanners) $ scAnchorSimilarityThreshold (cScanners config) + , scCopyPasteCheckEnabled = + fromMaybe (scCopyPasteCheckEnabled defScanners) + $ scCopyPasteCheckEnabled (cScanners config) } } where diff --git a/src/Xrefcheck/Config/Default.hs b/src/Xrefcheck/Config/Default.hs index 72bcd1a2..8550b0bd 100644 --- a/src/Xrefcheck/Config/Default.hs +++ b/src/Xrefcheck/Config/Default.hs @@ -67,6 +67,9 @@ scanners: # # This affects which anchors are generated for headers. flavor: #s{flavor} + + # Whether copy-paste check is enabled globally. + copyPasteCheckEnabled: True |] where ignoreLocalRefsFrom :: NonEmpty Text diff --git a/src/Xrefcheck/Core.hs b/src/Xrefcheck/Core.hs index 9c3ae450..3863285f 100644 --- a/src/Xrefcheck/Core.hs +++ b/src/Xrefcheck/Core.hs @@ -61,7 +61,7 @@ instance FromJSON Flavor where -- We keep this in text because scanners for different formats use different -- representation of this thing, and it actually appears in reports only. newtype Position = Position (Maybe Text) - deriving stock (Show, Eq, Generic) + deriving stock (Show, Eq, Generic, Ord) instance Given ColorMode => Buildable Position where build (Position pos) = case pos of @@ -77,7 +77,7 @@ data Reference = Reference , rAnchor :: Maybe Text -- ^ Section or custom anchor tag. , rPos :: Position - } deriving stock (Show, Generic) + } deriving stock (Show, Generic, Eq, Ord) -- | Context of anchor. data AnchorType diff --git a/src/Xrefcheck/Verify.hs b/src/Xrefcheck/Verify.hs index 1ed1fdbb..b7daca93 100644 --- a/src/Xrefcheck/Verify.hs +++ b/src/Xrefcheck/Verify.hs @@ -26,20 +26,33 @@ module Xrefcheck.Verify , verifyReference , checkExternalResource + -- * Copypaste check + , checkCopyPaste + , CopyPasteCheckResult (..) + -- * URI parsing , parseUri + + -- * Reporting errors , reportVerifyErrs + , reportCopyPasteErrors ) where import Universum import Control.Concurrent.Async (Async, async, cancel, poll, wait, withAsync) import Control.Exception (AsyncException (..), throwIO) +import Control.Exception.Safe (handleAsync, handleJust) import Control.Monad.Except (MonadError (..)) +import Data.Bits (toIntegralSized) import Data.ByteString qualified as BS +import Data.Char (isAlphaNum) +import Data.List (lookup) import Data.List qualified as L import Data.Map qualified as M import Data.Reflection (Given) +import Data.Text (toCaseFold) +import Data.Text qualified as T import Data.Text.Metrics (damerauLevenshteinNorm) import Data.Time (UTCTime, defaultTimeLocale, formatTime, readPTime, rfc822DateFormat) import Data.Time.Clock.POSIX (getPOSIXTime) @@ -66,10 +79,6 @@ import Text.URI (Authority (..), ParseExceptionBs, URI (..), mkURIBs) import Time (RatioNat, Second, Time (..), ms, sec, threadDelay, timeout, (+:+), (-:-)) import URI.ByteString qualified as URIBS -import Control.Exception.Safe (handleAsync, handleJust) -import Data.Bits (toIntegralSized) -import Data.List (lookup) -import Data.Text (toCaseFold) import Xrefcheck.Config import Xrefcheck.Core import Xrefcheck.Orphans () @@ -255,6 +264,21 @@ instance Given ColorMode => Buildable VerifyError where #{redirectedUrl} |] +data CopyPasteCheckResult = CopyPasteCheckResult + { crFile :: FilePath, + crOriginalRef :: Reference, + crCopiedRef :: Reference + } deriving stock (Show, Eq, Ord) + +instance (Given ColorMode) => Buildable CopyPasteCheckResult where + build CopyPasteCheckResult {..} = + [int|| + In file #{styleIfNeeded Faint (styleIfNeeded Bold crFile)} + #{crCopiedRef}\ + is possibly a bad copy paste of + #{crOriginalRef} + |] + reportVerifyErrs :: Given ColorMode => NonEmpty (WithReferenceLoc VerifyError) -> IO () reportVerifyErrs errs = fmt @@ -265,6 +289,17 @@ reportVerifyErrs errs = fmt Invalid references dumped, #{length errs} in total. |] +reportCopyPasteErrors + :: Given ColorMode => NonEmpty CopyPasteCheckResult -> IO () +reportCopyPasteErrors errs = fmt + [int|| + === Possible copy/paste errors === + + #{interpolateIndentF 2 (interpolateBlockListF' "➥ " build errs)} + Possible copy/paste errors dumped, #{length errs} in total. + |] + + data RetryAfter = Date UTCTime | Seconds (Time Second) deriving stock (Show, Eq) @@ -355,7 +390,7 @@ verifyRepo -> VerifyMode -> FilePath -> RepoInfo - -> IO (VerifyResult $ WithReferenceLoc VerifyError) + -> IO (VerifyResult $ WithReferenceLoc VerifyError, [CopyPasteCheckResult]) verifyRepo rw config@Config{..} @@ -363,24 +398,32 @@ verifyRepo root repoInfo'@(RepoInfo files _) = do - let toScan = do - (file, fileInfo) <- M.toList files + + let filesToScan = flip mapMaybe (M.toList files) $ \(file, fileInfo) -> do guard . not $ matchesGlobPatterns root (ecIgnoreRefsFrom cExclusions) file case fileInfo of Scanned fi -> do - ref <- _fiReferences fi - return (file, ref) - NotScannable -> empty -- No support for such file, can do nothing. - NotAddedToGit -> empty -- If this file is scannable, we've notified + Just (file, fi) + NotScannable -> Nothing -- No support for such file, can do nothing. + NotAddedToGit -> Nothing -- If this file is scannable, we've notified -- user that we are scanning only files -- added to Git while gathering RepoInfo. + toCheckCopyPaste = map (second _fiReferences) filesToScan + toScan = concatMap (\(file, fileInfo) -> map (file,) $ _fiReferences fileInfo) filesToScan + copyPasteErrors = if scCopyPasteCheckEnabled cScanners + then [ res + | (file, refs) <- toCheckCopyPaste, + res <- checkCopyPaste file refs + ] + else [] + progressRef <- newIORef $ initVerifyProgress (map snd toScan) accumulated <- loopAsyncUntil (printer progressRef) do forConcurrentlyCaching toScan ifExternalThenCache $ \(file, ref) -> verifyReference config mode progressRef repoInfo' root file ref - case accumulated of + (, copyPasteErrors) <$> case accumulated of Right res -> return $ fold res Left (exception, partialRes) -> do -- The user has hit Ctrl+C; display any verification errors we managed to find and exit. @@ -412,6 +455,41 @@ verifyRepo ExternalLoc -> CacheUnderKey rLink _ -> NoCaching +checkCopyPaste :: FilePath -> [Reference] -> [CopyPasteCheckResult] +checkCopyPaste file refs = do + let getLinkAndAnchor x = (rLink x, rAnchor x) + groupedRefs = + L.groupBy ((==) `on` getLinkAndAnchor) $ + sortBy (compare `on` getLinkAndAnchor) refs + concatMap checkGroup groupedRefs + where + checkGroup :: [Reference] -> [CopyPasteCheckResult] + checkGroup refsInGroup = do + let mergeLinkAndAnchor ref = maybe (rLink ref) (rLink ref <>) $ rAnchor ref + let refsInGroup' = flip map refsInGroup $ \ref -> + (ref, (prepareNameForCheck $ rName ref, + prepareNameForCheck $ mergeLinkAndAnchor ref)) + -- Most of time this will be Nothing and we won't need `others`. + -- The first matching link will be shown as original. + let mbSubstrRef = fst <$> find (textIsLinkSubstr . snd) refsInGroup' + others = fst <$> filter (not . textIsLinkSubstr . snd) refsInGroup' + maybe [] (\substrRef -> map (CopyPasteCheckResult file substrRef) others) mbSubstrRef + + textIsLinkSubstr :: (Text, Text) -> Bool + textIsLinkSubstr (prepName, prepLink) = prepName `isSubSeq` prepLink + + prepareNameForCheck :: Text -> Text + prepareNameForCheck = T.toLower . T.filter isAlphaNum + + isSubSeq :: Text -> Text -> Bool + isSubSeq "" _str = True + isSubSeq _que "" = False + isSubSeq que str + | qhead == shead = isSubSeq qtail stail + | otherwise = isSubSeq que stail + where (qhead, qtail) = T.splitAt 1 que + (shead, stail) = T.splitAt 1 str + shouldCheckLocType :: VerifyMode -> LocationType -> Bool shouldCheckLocType mode locType | isExternal locType = shouldCheckExternal mode diff --git a/tests/Test/Xrefcheck/CopyPasteCheckSpec.hs b/tests/Test/Xrefcheck/CopyPasteCheckSpec.hs new file mode 100644 index 00000000..fbe09ba6 --- /dev/null +++ b/tests/Test/Xrefcheck/CopyPasteCheckSpec.hs @@ -0,0 +1,65 @@ +{- SPDX-FileCopyrightText: 2019 Serokell + - + - SPDX-License-Identifier: MPL-2.0 + -} + +module Test.Xrefcheck.CopyPasteCheckSpec where + +import Universum + +import Test.Tasty (TestTree, testGroup) +import Test.Tasty.HUnit (Assertion, testCase, (@?=)) + +import Xrefcheck.Core +import Xrefcheck.Verify + +assertUnordered :: (Show a, Ord a) => [a] -> [a] -> Assertion +assertUnordered = (@?=) `on` sort + +testPath :: FilePath +testPath = "test-path" + +test_copyPasteCheck :: TestTree +test_copyPasteCheck = testGroup "Copypaste check" + [ testCase "Detect copypaste error if there is a link with a matching name" $ do + let link = "./first-file" + anchor = Just "heading" + differentAnchor = Nothing + defPos = Position Nothing + original1 = Reference "_- First - - File" link anchor defPos + original2 = Reference "_- First - fi - le" link anchor defPos + notCopied = Reference " Link 2 " link differentAnchor defPos + copied1 = Reference " foo bar" link anchor defPos + copied2 = Reference " Baz quux" link anchor defPos + input = [original1, original2, notCopied, copied1, copied2] + res = checkCopyPaste testPath input + expectedRes = + -- only first matching link is shown in the output + [ CopyPasteCheckResult testPath original1 copied1 + , CopyPasteCheckResult testPath original1 copied2 + ] + res `assertUnordered` expectedRes + , testCase "Succeed if there is not link with a matching name" $ do + let link = "./first-file" + anchor = Just "heading" + defPos = Position Nothing + original1 = Reference "_Foo bar" link anchor defPos + original2 = Reference " Baz quux" link anchor defPos + original3 = Reference " Foo qubarx" link anchor defPos + input = [original1, original2, original3] + res = checkCopyPaste testPath input + expectedRes = [] + res @?= expectedRes + , testCase "Check external links" $ do + let link = "https://github.com" + anchor = Nothing + defPos = Position Nothing + original = Reference "github" link anchor defPos + copied = Reference "gitlab" link anchor defPos + input = [original, copied] + res = checkCopyPaste testPath input + expectedRes = + [ CopyPasteCheckResult testPath original copied + ] + res @?= expectedRes + ] diff --git a/tests/Test/Xrefcheck/IgnoreRegexSpec.hs b/tests/Test/Xrefcheck/IgnoreRegexSpec.hs index c3143086..31bd1d53 100644 --- a/tests/Test/Xrefcheck/IgnoreRegexSpec.hs +++ b/tests/Test/Xrefcheck/IgnoreRegexSpec.hs @@ -44,7 +44,7 @@ test_ignoreRegex = give WithoutColors $ verifyRes <- allowRewrite showProgressBar $ \rw -> verifyRepo rw config verifyMode root $ srRepoInfo scanResult - let brokenLinks = pickBrokenLinks verifyRes + let brokenLinks = pickBrokenLinks $ fst verifyRes let matchedLinks = [ "https://bad.referenc/" diff --git a/tests/configs/github-config.yaml b/tests/configs/github-config.yaml index 35e3a088..c0698528 100644 --- a/tests/configs/github-config.yaml +++ b/tests/configs/github-config.yaml @@ -56,3 +56,6 @@ scanners: # # This affects which anchors are generated for headers. flavor: GitHub + + # Whether copy-paste check is enabled globally. + copyPasteCheckEnabled: True diff --git a/tests/golden/check-copy-paste/check-copy-paste.bats b/tests/golden/check-copy-paste/check-copy-paste.bats new file mode 100644 index 00000000..45971b8a --- /dev/null +++ b/tests/golden/check-copy-paste/check-copy-paste.bats @@ -0,0 +1,17 @@ +#!/usr/bin/env bats + +# SPDX-FileCopyrightText: 2022 Serokell +# +# SPDX-License-Identifier: MPL-2.0 + +load '../helpers/bats-support/load' +load '../helpers/bats-assert/load' +load '../helpers/bats-file/load' +load '../helpers' + + +@test "Check possible copy-paste errors and copy-paste annotations " { + to_temp xrefcheck + + assert_diff expected.gold +} diff --git a/tests/golden/check-copy-paste/expected.gold b/tests/golden/check-copy-paste/expected.gold new file mode 100644 index 00000000..796b4c73 --- /dev/null +++ b/tests/golden/check-copy-paste/expected.gold @@ -0,0 +1,48 @@ +=== Possible copy/paste errors === + + ➥ In file second-file.md + reference (relative) at src:13:1-29: + - text: "Lol Kek" + - link: ./first-file.md + - anchor: - + is possibly a bad copy paste of + reference (relative) at src:7:1-34: + - text: "First file" + - link: ./first-file.md + - anchor: - + + ➥ In file second-file.md + reference (relative) at src:14:1-30: + - text: "Baz quux" + - link: ./first-file.md + - anchor: - + is possibly a bad copy paste of + reference (relative) at src:7:1-34: + - text: "First file" + - link: ./first-file.md + - anchor: - + + ➥ In file second-file.md + reference (relative) at src:24:1-29: + - text: "fdw" + - link: ./first-file.md + - anchor: chor + is possibly a bad copy paste of + reference (relative) at src:23:1-32: + - text: "ff-cho" + - link: ./first-file.md + - anchor: chor + + ➥ In file second-file.md + reference (external) at src:29:1-28: + - text: "gitlab" + - link: https://github.com + - anchor: - + is possibly a bad copy paste of + reference (external) at src:28:1-28: + - text: "github" + - link: https://github.com + - anchor: - + +Possible copy/paste errors dumped, 4 in total. +All repository links are valid. diff --git a/tests/golden/check-copy-paste/first-file.md b/tests/golden/check-copy-paste/first-file.md new file mode 100644 index 00000000..92b3319f --- /dev/null +++ b/tests/golden/check-copy-paste/first-file.md @@ -0,0 +1,11 @@ + + +# heading + +# anch + +# chor diff --git a/tests/golden/check-copy-paste/second-file.md b/tests/golden/check-copy-paste/second-file.md new file mode 100644 index 00000000..02f4d554 --- /dev/null +++ b/tests/golden/check-copy-paste/second-file.md @@ -0,0 +1,29 @@ + + +[ First file ](./first-file.md) + + +[ Link 3](./first-file.md#heading) + + +[ Lol Kek](./first-file.md) +[ Baz quux](./first-file.md) + + + +[ asd](./first-file.md#anch) +[ fdw](./first-file.md#anch) + + + +[ ff-cho](./first-file.md#chor) +[ fdw](./first-file.md#chor) + + + +[github](https://github.com) +[gitlab](https://github.com)