Skip to content

Commit

Permalink
[#64] Implement copy-paste check, add golden tests, add new scan errors
Browse files Browse the repository at this point in the history
  • Loading branch information
YuriRomanowski committed Dec 14, 2022
1 parent 6abda05 commit ea1050b
Show file tree
Hide file tree
Showing 14 changed files with 325 additions and 31 deletions.
5 changes: 3 additions & 2 deletions src/Xrefcheck/Command.hs
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ import Xrefcheck.Scan
import Xrefcheck.Scanners.Markdown (markdownSupport)
import Xrefcheck.System (askWithinCI)
import Xrefcheck.Util
import Xrefcheck.Verify (reportVerifyErrs, verifyErrors, verifyRepo)
import Xrefcheck.Verify (reportCopyPasteErrors, reportVerifyErrs, verifyErrors, verifyRepo)

readConfig :: FilePath -> IO Config
readConfig path = fmap (normaliseConfigFilePaths . overrideConfig) do
Expand Down Expand Up @@ -81,11 +81,12 @@ defaultAction Options{..} = do

whenJust (nonEmpty $ sortBy (compare `on` seFile) scanErrs) $ reportScanErrs

verifyRes <- allowRewrite showProgressBar $ \rw -> do
(verifyRes, copyPasteErrors) <- allowRewrite showProgressBar $ \rw -> do
let fullConfig = config
{ cNetworking = addNetworkingOptions (cNetworking config) oNetworkingOptions }
verifyRepo rw fullConfig oMode oRoot repoInfo

whenJust (nonEmpty copyPasteErrors) reportCopyPasteErrors
case verifyErrors verifyRes of
Nothing | null scanErrs -> fmtLn "All repository links are valid."
Nothing -> exitFailure
Expand Down
12 changes: 8 additions & 4 deletions src/Xrefcheck/Core.hs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ module Xrefcheck.Core where
import Universum

import Control.Lens (makeLenses)
import Control.Lens.Combinators (makeLensesWith)
import Data.Aeson (FromJSON (..), withText)
import Data.Char (isAlphaNum)
import Data.Char qualified as C
Expand Down Expand Up @@ -70,14 +71,17 @@ instance Given ColorMode => Buildable Position where

-- | Full info about a reference.
data Reference = Reference
{ rName :: Text
{ rName :: Text
-- ^ Text displayed as reference.
, rLink :: Text
, rLink :: Text
-- ^ File or site reference points to.
, rAnchor :: Maybe Text
, rAnchor :: Maybe Text
-- ^ Section or custom anchor tag.
, rPos :: Position
, rPos :: Position
-- ^ Whether to check bad copy/paste for this link
, rCheckCopyPaste :: Bool
} deriving stock (Show, Generic)
makeLensesWith postfixFields ''Reference

-- | Context of anchor.
data AnchorType
Expand Down
13 changes: 11 additions & 2 deletions src/Xrefcheck/Scan.hs
Original file line number Diff line number Diff line change
Expand Up @@ -117,18 +117,27 @@ data ScanErrorDescription
= LinkErr
| FileErr
| ParagraphErr Text
| LinkErrCpc
| FileErrCpc
| ParagraphErrCpc Text
| UnrecognisedErr Text
deriving stock (Show, Eq)

instance Buildable ScanErrorDescription where
build = \case
LinkErr -> [int||Expected a LINK after "ignore link" annotation|]
LinkErrCpc -> [int||Expected a LINK after "no duplication check in link" annotation|]
FileErr -> [int||Annotation "ignore all" must be at the top of \
markdown or right after comments at the top|]
FileErrCpc -> [int||Annotation "no duplication check in file" must be at the top of \
markdown or right after comments at the top|]
ParagraphErr txt -> [int||Expected a PARAGRAPH after \
"ignore paragraph" annotation, but found #{txt}|]
UnrecognisedErr txt -> [int||Unrecognised option "#{txt}" perhaps you meant \
<"ignore link"|"ignore paragraph"|"ignore all">|]
ParagraphErrCpc txt -> [int||Expected a PARAGRAPH after \
"no duplication check in paragraph" annotation, but found #{txt}|]
UnrecognisedErr txt -> [int||Unrecognised option "#{txt}", perhaps you meant
<"ignore link"|"ignore paragraph"|"ignore all">
or "no duplication check in <link|paragraph|file>"?|]

specificFormatsSupport :: [([Extension], ScanAction)] -> FormatsSupport
specificFormatsSupport formats = \ext -> M.lookup ext formatsMap
Expand Down
19 changes: 10 additions & 9 deletions src/Xrefcheck/Scanners/Markdown.hs
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ instance Buildable (Node a) where
type NodeCPC = Node CopyPasteCheck

newtype CopyPasteCheck = CopyPasteCheck
{ shouldCheck :: Bool
{ cpcShouldCheck :: Bool
} deriving stock (Show, Eq, Generic)

toPosition :: Maybe PosInfo -> Position
Expand Down Expand Up @@ -267,7 +267,7 @@ processAnnotations globalCpcCheckEnabled fp = withIgnoreMode . cataNodeWithParen
_ -> pass
use ssIgnoreCopyPasteCheck >>= \case
Just (Ignore (IMSLink ExpectingLinkInParagraph) pragmaPos) -> do
lift $ tell $ makeError pragmaPos fp LinkErr -- TODO: different error type
lift $ tell $ makeError pragmaPos fp LinkErrCpc
ssIgnoreCopyPasteCheck .= Nothing
_ -> pass

Expand Down Expand Up @@ -315,7 +315,7 @@ processAnnotations globalCpcCheckEnabled fp = withIgnoreMode . cataNodeWithParen
use ssIgnoreCopyPasteCheck >>= \case
Just (Ignore IMSParagraph modePos) ->
whenM (use ssParagraphExpectedAfterCpcAnnotation) $ do
reportExpectedParagraphAfterIgnoreAnnotation modePos ty
lift . tell . makeError modePos fp . ParagraphErrCpc $ prettyType ty
ssParagraphExpectedAfterCpcAnnotation .= False
ssIgnoreCopyPasteCheck .= Nothing
_ -> pass
Expand Down Expand Up @@ -351,7 +351,7 @@ processAnnotations globalCpcCheckEnabled fp = withIgnoreMode . cataNodeWithParen
currentIgnore <- use ssIgnoreCopyPasteCheck
case currentIgnore of
Just (Ignore {_ignoreMode = IMSLink ParentExpectsLink}) -> do
lift $ tell $ makeError modePos fp LinkErr -- TODO: different error type
lift $ tell $ makeError modePos fp LinkErrCpc
ssIgnoreCopyPasteCheck .= Nothing
_ -> pass
return node'
Expand Down Expand Up @@ -410,7 +410,7 @@ processAnnotations globalCpcCheckEnabled fp = withIgnoreMode . cataNodeWithParen
-- any correct annotations should be handled in `checkGlobalAnnotations`
-- function.
IMAll -> do
lift . tell $ makeError correctPos fp FileErr -- TODO: different error type
lift . tell $ makeError correctPos fp FileErrCpc
pure Nothing

whenJust mbIgnoreModeState $ \ignoreModeState -> do
Expand All @@ -419,11 +419,11 @@ processAnnotations globalCpcCheckEnabled fp = withIgnoreMode . cataNodeWithParen
Nothing -> setupNewCpcState
Just (Ignore curIgn prevPos)
| IMSLink _ <- curIgn -> do
lift $ tell $ makeError prevPos fp LinkErr -- TODO: different error type
lift $ tell $ makeError prevPos fp LinkErrCpc
setupNewCpcState
| IMSParagraph <- curIgn -> case ignoreModeState of
IMSParagraph -> do
lift . tell . makeError prevPos fp . ParagraphErr $ prettyType nodeType -- TODO: different error type
lift . tell . makeError prevPos fp . ParagraphErrCpc $ prettyType nodeType
setupNewCpcState
-- It's OK to have link annotation when paragraph is ignored
-- because in this case all links and all annotations are ignored.
Expand Down Expand Up @@ -482,7 +482,7 @@ nodeExtractInfo fp (C.Node nPos nTy nSubs) = do

where
extractor :: NodeCPC -> ExtractorM FileInfoDiff
extractor node@(Node pos ty _info _) =
extractor node@(Node pos ty info _) =
case ty of
HTML_BLOCK _ -> do
return mempty
Expand Down Expand Up @@ -532,7 +532,8 @@ nodeExtractInfo fp (C.Node nPos nTy nSubs) = do
t : ts -> (t, Just $ T.intercalate "#" ts)
[] -> error "impossible"
return $ FileInfoDiff
(DList.singleton $ Reference {rName, rPos, rLink, rAnchor})
(DList.singleton $
Reference {rName, rPos, rLink, rAnchor, rCheckCopyPaste = cpcShouldCheck info})
DList.empty

-- | Check for global annotations, ignoring simple comments if there are any.
Expand Down
87 changes: 79 additions & 8 deletions src/Xrefcheck/Verify.hs
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,10 @@ module Xrefcheck.Verify

-- * URI parsing
, parseUri

-- * Reporting errors
, reportVerifyErrs
, reportCopyPasteErrors
) where

import Universum
Expand All @@ -37,9 +40,11 @@ import Control.Concurrent.Async (Async, async, cancel, poll, wait, withAsync)
import Control.Exception (AsyncException (..), throwIO)
import Control.Monad.Except (MonadError (..))
import Data.ByteString qualified as BS
import Data.Char (isAlphaNum)
import Data.List qualified as L
import Data.Map qualified as M
import Data.Reflection (Given)
import Data.Text qualified as T
import Data.Text.Metrics (damerauLevenshteinNorm)
import Data.Time (UTCTime, defaultTimeLocale, formatTime, readPTime, rfc822DateFormat)
import Data.Time.Clock.POSIX (getPOSIXTime)
Expand Down Expand Up @@ -255,6 +260,21 @@ instance Given ColorMode => Buildable VerifyError where
#{redirectedUrl}
|]

data CopyPasteCheckResult = CopyPasteCheckResult
{ crFile :: FilePath,
crOriginalRef :: Reference,
crCopiedRef :: Reference
}

instance (Given ColorMode) => Buildable CopyPasteCheckResult where
build CopyPasteCheckResult {..} =
[int||
In file #{styleIfNeeded Faint (styleIfNeeded Bold crFile)}
#{crCopiedRef}\
is possibly a bad copy paste of
#{crOriginalRef}
|]

reportVerifyErrs
:: Given ColorMode => NonEmpty (WithReferenceLoc VerifyError) -> IO ()
reportVerifyErrs errs = fmt
Expand All @@ -265,6 +285,17 @@ reportVerifyErrs errs = fmt
Invalid references dumped, #{length errs} in total.
|]

reportCopyPasteErrors
:: Given ColorMode => NonEmpty CopyPasteCheckResult -> IO ()
reportCopyPasteErrors errs = fmt
[int||
=== Possible copy/paste errors ===

#{interpolateIndentF 2 (interpolateBlockListF' "➥ " build errs)}
Possible copy/paste errors dumped, #{length errs} in total.
|]



data RetryAfter = Date UTCTime | Seconds (Time Second)
deriving stock (Show, Eq)
Expand Down Expand Up @@ -355,32 +386,37 @@ verifyRepo
-> VerifyMode
-> FilePath
-> RepoInfo
-> IO (VerifyResult $ WithReferenceLoc VerifyError)
-> IO (VerifyResult $ WithReferenceLoc VerifyError, [CopyPasteCheckResult])
verifyRepo
rw
config@Config{..}
mode
root
repoInfo'@(RepoInfo files _)
= do
let toScan = do
(file, fileInfo) <- M.toList files

let filesToScan = flip mapMaybe (M.toList files) $ \(file, fileInfo) -> do
guard . not $ matchesGlobPatterns root (ecIgnoreRefsFrom cExclusions) file
case fileInfo of
Scanned fi -> do
ref <- _fiReferences fi
return (file, ref)
NotScannable -> empty -- No support for such file, can do nothing.
NotAddedToGit -> empty -- If this file is scannable, we've notified
Just (file, _fiReferences fi)
NotScannable -> Nothing -- No support for such file, can do nothing.
NotAddedToGit -> Nothing -- If this file is scannable, we've notified
-- user that we are scanning only files
-- added to Git while gathering RepoInfo.

shouldCheckCopyPaste _ = True
toCheckCopyPaste = filter (\(file, _refs) -> shouldCheckCopyPaste file) filesToScan
toScan = concatMap (\(file, refs) -> map (file,) refs) filesToScan
copyPasteErrors = [ res
| (file, refs) <- toCheckCopyPaste, res <- checkCopyPaste file refs]

progressRef <- newIORef $ initVerifyProgress (map snd toScan)

accumulated <- loopAsyncUntil (printer progressRef) do
forConcurrentlyCaching toScan ifExternalThenCache $ \(file, ref) ->
verifyReference config mode progressRef repoInfo' root file ref
case accumulated of
(, copyPasteErrors) <$> case accumulated of
Right res -> return $ fold res
Left (exception, partialRes) -> do
-- The user has hit Ctrl+C; display any verification errors we managed to find and exit.
Expand Down Expand Up @@ -412,6 +448,41 @@ verifyRepo
ExternalLoc -> CacheUnderKey rLink
_ -> NoCaching

checkCopyPaste :: FilePath -> [Reference] -> [CopyPasteCheckResult]
checkCopyPaste file refs = do
let groupedRefs =
L.groupBy ((==) `on` rLink) $
sortBy (compare `on` rLink) $
filter rCheckCopyPaste refs
concatMap checkGroup groupedRefs
where
checkGroup :: [Reference] -> [CopyPasteCheckResult]
checkGroup refsInGroup = do
let refsInGroup' = flip map refsInGroup $ \ref ->
(ref, (prepareRefName (rName ref), prepareRefLink (rLink ref)))
let mbSubstrRef = fst <$> find (textIsLinkSubstr . snd) refsInGroup'
others = fst <$> filter (not . textIsLinkSubstr . snd) refsInGroup'
maybe [] (\substrRef -> map (CopyPasteCheckResult file substrRef) others) mbSubstrRef

textIsLinkSubstr :: (Text, Text) -> Bool
textIsLinkSubstr (prepName, prepLink) = prepName `isSubSeq` prepLink


prepareRefName :: Text -> Text
prepareRefName = T.toLower . T.filter isAlphaNum

prepareRefLink :: Text -> Text
prepareRefLink = T.toLower

isSubSeq :: Text -> Text -> Bool
isSubSeq "" _str = True
isSubSeq _que "" = False
isSubSeq que str
| qhead == shead = isSubSeq qtail stail
| otherwise = isSubSeq que stail
where (qhead, qtail) = T.splitAt 1 que
(shead, stail) = T.splitAt 1 str

shouldCheckLocType :: VerifyMode -> LocationType -> Bool
shouldCheckLocType mode locType
| isExternal locType = shouldCheckExternal mode
Expand Down
2 changes: 1 addition & 1 deletion tests/Test/Xrefcheck/IgnoreRegexSpec.hs
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ test_ignoreRegex = give WithoutColors $
verifyRes <- allowRewrite showProgressBar $ \rw ->
verifyRepo rw config verifyMode root $ srRepoInfo scanResult

let brokenLinks = pickBrokenLinks verifyRes
let brokenLinks = pickBrokenLinks $ fst verifyRes

let matchedLinks =
[ "https://bad.referenc/"
Expand Down
6 changes: 3 additions & 3 deletions tests/Test/Xrefcheck/TooManyRequestsSpec.hs
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ test_tooManyRequests = testGroup "429 response tests"
}
}
_ <- verifyReferenceWithProgress
(Reference "" "http://127.0.0.1:5000/429" Nothing (Position Nothing))
(Reference "" "http://127.0.0.1:5000/429" Nothing (Position Nothing) False)
progressRef
Progress{..} <- vrExternal <$> readIORef progressRef
let ttc = ttTimeToCompletion <$> pTaskTimestamp
Expand All @@ -88,7 +88,7 @@ test_tooManyRequests = testGroup "429 response tests"
}
}
_ <- verifyReferenceWithProgress
(Reference "" "http://127.0.0.1:5000/429" Nothing (Position Nothing))
(Reference "" "http://127.0.0.1:5000/429" Nothing (Position Nothing) False)
progressRef
Progress{..} <- vrExternal <$> readIORef progressRef
let ttc = fromMaybe (sec 0) $ ttTimeToCompletion <$> pTaskTimestamp
Expand All @@ -114,7 +114,7 @@ test_tooManyRequests = testGroup "429 response tests"
}
}
_ <- verifyReferenceWithProgress
(Reference "" "http://127.0.0.1:5000/429" Nothing (Position Nothing))
(Reference "" "http://127.0.0.1:5000/429" Nothing (Position Nothing) False)
progressRef
Progress{..} <- vrExternal <$> readIORef progressRef
let ttc = ttTimeToCompletion <$> pTaskTimestamp
Expand Down
2 changes: 1 addition & 1 deletion tests/Test/Xrefcheck/UtilRequests.hs
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ checkLinkAndProgressWithServer mock link progress vrExpectation =

verifyLink :: Text -> IO (VerifyResult VerifyError, Progress Int)
verifyLink link = do
let reference = Reference "" link Nothing (Position Nothing)
let reference = Reference "" link Nothing (Position Nothing) False
progRef <- newIORef $ initVerifyProgress [reference]
result <- verifyReferenceWithProgress reference progRef
p <- readIORef progRef
Expand Down
17 changes: 17 additions & 0 deletions tests/golden/check-copy-paste/check-copy-paste.bats
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#!/usr/bin/env bats

# SPDX-FileCopyrightText: 2022 Serokell <https://serokell.io>
#
# SPDX-License-Identifier: MPL-2.0

load '../helpers/bats-support/load'
load '../helpers/bats-assert/load'
load '../helpers/bats-file/load'
load '../helpers'


@test "Check possible copy-paste errors and copy-paste annotations " {
to_temp xrefcheck

assert_diff expected.gold
}
Loading

0 comments on commit ea1050b

Please sign in to comment.