Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[#93] Add support for basic html tags #259

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .xrefcheck.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ exclusions:
ignore:
- tests/markdowns/**/*
- tests/golden/**/*
- docs/output-sample/**/*
- docs/output-sample/**/*.md

scanners:
markdown:
Expand Down
3 changes: 3 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,9 @@ Unreleased
* [#231](https://github.com/serokell/xrefcheck/pull/231)
+ Anchor analysis takes now into account the appropriate case-sensitivity depending on
the configured Markdown flavour.
* [#259](https://github.com/serokell/xrefcheck/pull/259)
+ Add support for image tags `<img src="link">`, anchor hyperlinks `<a href="link">Text</a>`
and anchor target locations `<a name="loc">` or `<a id="loc">`.

0.2.2
==========
Expand Down
130 changes: 95 additions & 35 deletions src/Xrefcheck/Scanners/Markdown.hs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import Universum
import CMarkGFM
(Node (..), NodeType (..), PosInfo (..), commonmarkToNode, extAutolink, optFootnotes)
import Control.Lens (_Just, makeLenses, makeLensesFor, (.=))
import Control.Monad.Trans.RWS.CPS qualified as RWS
import Control.Monad.Trans.Writer.CPS (Writer, runWriter, tell)
import Data.Aeson (FromJSON (..), genericParseJSON)
import Data.ByteString.Lazy qualified as BSL
Expand Down Expand Up @@ -195,6 +196,12 @@ removeIgnored = withIgnoreMode . cataNodeWithParentNodeInfo remove
(IMSLink _, IMAGE {}) -> do
ssIgnore .= Nothing
return defNode
(IMSLink _, HTML_INLINE text) | isLink text -> do
ssIgnore .= Nothing
pure defNode
(IMSLink _, HTML_BLOCK text) | isLink text -> do
ssIgnore .= Nothing
pure defNode
(IMSLink ignoreLinkState, _) -> do
when (ignoreLinkState == ExpectingLinkInSubnodes) $
ssIgnore . _Just . ignoreMode .= IMSLink ParentExpectsLink
Expand Down Expand Up @@ -264,57 +271,59 @@ removeIgnored = withIgnoreMode . cataNodeWithParentNodeInfo remove
pure node
(node, _) -> pure node

findAttributes :: [Text] -> [Attribute Text] -> Maybe Text
findAttributes (map T.toLower -> attrs) =
fmap snd . find ((`elem` attrs) . T.toLower . fst)

isLink :: Text -> Bool
isLink (parseTags -> tags) = case safeHead tags of
Just (TagOpen tag attrs) ->
T.toLower tag == "a" && isJust (findAttributes ["href"] attrs)
|| T.toLower tag == "img" && isJust (findAttributes ["src"] attrs)
_ -> False

aeqz marked this conversation as resolved.
Show resolved Hide resolved
-- | Custom `foldMap` for source tree.
foldNode :: (Monoid a, Monad m) => (Node -> m a) -> Node -> m a
foldNode action node@(Node _ _ subs) = do
a <- action node
b <- concatForM subs (foldNode action)
return (a <> b)

type ExtractorM a = ReaderT MarkdownConfig (Writer [ScanError 'Parse]) a
type ExtractorM a = RWS.RWS MarkdownConfig [ScanError 'Parse] (Maybe Reference) a

-- | Extract information from source tree.
nodeExtractInfo :: Node -> ExtractorM FileInfo
nodeExtractInfo input@(Node _ _ nSubs) = do
if checkIgnoreAllFile nSubs
then return (diffToFileInfo mempty)
else diffToFileInfo <$> (foldNode extractor =<< lift (removeIgnored input))
else diffToFileInfo <$> (foldNode extractor =<< (RWS.writer . runWriter $ removeIgnored input))

where
extractor :: Node -> ExtractorM FileInfoDiff
extractor node@(Node pos ty _) =
case ty of
HTML_BLOCK _ -> do
return mempty
extractor node@(Node pos ty _) = do
reference' <- RWS.get
-- If current state is not `Nothing`, try extracting associated text
let fileInfoDiff = case (reference', ty) of
(Just ref, TEXT text) ->
mempty & fidReferences .~ DList.singleton ref {rName = text}
(Just ref, _) -> mempty & fidReferences .~ DList.singleton ref
_ -> mempty
RWS.put Nothing
fmap (fileInfoDiff <>) case ty of
HTML_BLOCK text | isLink text -> extractHtmlLink text

HTML_BLOCK text -> extractAnchor text

HEADING lvl -> do
flavor <- asks mcFlavor
flavor <- RWS.asks mcFlavor
let aType = HeaderAnchor lvl
let aName = headerToAnchor flavor $ nodeExtractText node
let aPos = toPosition pos
return $ FileInfoDiff DList.empty $ DList.singleton $ Anchor {aType, aName, aPos}

HTML_INLINE text -> do
let
mName = do
tag <- safeHead $ parseTags text
attributes <- case tag of
TagOpen a attrs
| T.toLower a == "a" -> Just attrs
_ -> Nothing
(_, name) <- find (\(field, _) -> T.toLower field `elem` ["name", "id"]) attributes
pure name

case mName of
Just aName -> do
let aType = HandAnchor
aPos = toPosition pos
return $ FileInfoDiff
mempty
(pure $ Anchor {aType, aName, aPos})
HTML_INLINE text | isLink text -> extractHtmlLink text

Nothing -> do
return mempty
HTML_INLINE text -> extractAnchor text

LINK url _ -> extractLink url

Expand All @@ -328,17 +337,69 @@ nodeExtractInfo input@(Node _ _ nSubs) = do
rPos = toPosition pos
link = if null url then rName else url

let (rLink, rAnchor) = case T.splitOn "#" link of
[t] -> (t, Nothing)
t : ts -> (t, Just $ T.intercalate "#" ts)
[] -> error "impossible"
let (rLink, rAnchor) = splitLink link

let rInfo = referenceInfo rLink

return $ FileInfoDiff
(DList.singleton $ Reference {rName, rPos, rLink, rAnchor, rInfo})
DList.empty

extractAnchor :: Text -> ExtractorM FileInfoDiff
extractAnchor text = do
let mName = do
tag <- safeHead $ parseTags text
attributes <- case tag of
TagOpen a attrs | T.toLower a == "a" -> Just attrs
_ -> Nothing
findAttributes ["name", "id"] attributes

case mName of
Just aName -> do
let aType = HandAnchor
aPos = toPosition pos
return $ FileInfoDiff
mempty
(pure $ Anchor {aType, aName, aPos})

Nothing -> do
return mempty

extractHtmlReference :: [Attribute Text] -> Maybe PosInfo -> DList.DList Reference
extractHtmlReference attrs tagPos = fromMaybe mempty do
link <- findAttributes ["href"] attrs
let (rLink, rAnchor) = splitLink link
pure . DList.singleton $ Reference "" rLink rAnchor (toPosition tagPos) (referenceInfo rLink)

splitLink :: Text -> (Text, Maybe Text)
splitLink link = case T.splitOn "#" link of
[t] -> (t, Nothing)
t : ts -> (t, Just $ T.intercalate "#" ts)
[] -> error "impossible"

extractHtmlImage :: [Attribute Text] -> Maybe PosInfo -> DList.DList Reference
extractHtmlImage attrs tagPos = fromMaybe mempty do
link <- findAttributes ["src"] attrs
pure . DList.singleton $ Reference "" link Nothing (toPosition tagPos) (referenceInfo link)

extractHtmlLink :: Text -> ExtractorM FileInfoDiff
extractHtmlLink text =
case safeHead $ parseTags text of
Just (TagOpen tag attrs) | T.toLower tag == "img" ->
pure $ mempty & fidReferences .~ extractHtmlImage attrs pos
Just (TagOpen tag attrs) | T.toLower tag == "a" -> do
let reference = extractHtmlReference attrs pos
case DList.toList reference of
[ref] -> do
-- The `cmark-gfm` package parses the link tag as three separate nodes:
-- `HTML_INLINE` with an opening tag, a `TEXT` with text in between,
-- and `HTML_INLINE` with a closing tag. So we keep the extracted link in a state and
-- try to get associated text in the next node.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As we have been discussing in recent PRs, this code seems to be more difficult to understand with each new feature that requires to modify it. At least, comments and separating chunks of code to functions help in clarifying it for the moment 👍

RWS.put $ Just ref
pure mempty
_ -> pure mempty
_ -> pure mempty

aeqz marked this conversation as resolved.
Show resolved Hide resolved
-- | Check if there is `ignore all` at the beginning of the file,
-- ignoring preceding comments if there are any.
checkIgnoreAllFile :: [Node] -> Bool
Expand Down Expand Up @@ -406,11 +467,10 @@ textToMode _ = NotAnAnnotation

parseFileInfo :: MarkdownConfig -> LT.Text -> (FileInfo, [ScanError 'Parse])
parseFileInfo config input
= runWriter
$ flip runReaderT config
$ nodeExtractInfo
= RWS.evalRWS
(nodeExtractInfo
$ commonmarkToNode [optFootnotes] [extAutolink]
$ toStrict input
$ toStrict input) config Nothing

markdownScanner :: MarkdownConfig -> ScanAction
markdownScanner config canonicalFile =
Expand Down
30 changes: 30 additions & 0 deletions tests/golden/check-html/check-html.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,38 @@

## <a name='one'> <a name=two> <a NAME="three"> <a name="four"></a> <a NAME=five > Title1

<a name=six>

text <a id=seven> text

[One](#one)
[Two](#two)
[Three](#three)
[Four](#four)
[Five](#five)
[Six](#six)
[Seven](#seven)

<img src="https://user-images.githubusercontent.com/5394217/70820564-06b06e00-1dea-11ea-9680-27f661ca2a58.png" alt="Output sample" width="600"/>

text <img src="https://user-images.githubusercontent.com/5394217/70820564-06b06e00-1dea-11ea-9680-27f661ca2a58.png" alt="Output sample" width="600"/> text

<a href=https://serokell.io/>serokell</a>

text <a href=https://serokell.io/>serokell</a> text

<a href=#six>Six</a>

text <a href=#seven>Seven</a> text

<!-- xrefcheck: ignore link -->
<a href=https://serokell.io/404>serokell404</a>

<!-- xrefcheck: ignore link -->
aeqz marked this conversation as resolved.
Show resolved Hide resolved
text <a href=https://serokell.io/404>serokell404</a> text

<!-- xrefcheck: ignore link -->
<img src=https://serokell.io/404>

<!-- xrefcheck: ignore link -->
text <img src=https://serokell.io/404> text
4 changes: 2 additions & 2 deletions tests/golden/check-html/check.html.bats
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ load '../helpers'


@test "All HTML anchors should be valid" {
run xrefcheck
to_temp xrefcheck -v

assert_output --partial "All repository links are valid."
assert_diff expected.gold
}
67 changes: 67 additions & 0 deletions tests/golden/check-html/expected.gold
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
=== Repository data ===

check-html.md:
- references:
- reference (file-local) at src:13:1-11:
- text: "One"
- link: -
- anchor: one
- reference (file-local) at src:14:1-11:
- text: "Two"
- link: -
- anchor: two
- reference (file-local) at src:15:1-15:
- text: "Three"
- link: -
- anchor: three
- reference (file-local) at src:16:1-13:
- text: "Four"
- link: -
- anchor: four
- reference (file-local) at src:17:1-13:
- text: "Five"
- link: -
- anchor: five
- reference (file-local) at src:18:1-11:
- text: "Six"
- link: -
- anchor: six
- reference (file-local) at src:19:1-15:
- text: "Seven"
- link: -
- anchor: seven
- reference (external) at src:21:1-144:
- text: ""
- link: https://user-images.githubusercontent.com/5394217/70820564-06b06e00-1dea-11ea-9680-27f661ca2a58.png
- anchor: -
- reference (external) at src:23:6-149:
- text: ""
- link: https://user-images.githubusercontent.com/5394217/70820564-06b06e00-1dea-11ea-9680-27f661ca2a58.png
- anchor: -
- reference (external) at src:25:1-29:
- text: "serokell"
- link: https://serokell.io/
- anchor: -
- reference (external) at src:27:6-34:
- text: "serokell"
- link: https://serokell.io/
- anchor: -
- reference (file-local) at src:29:1-13:
- text: "Six"
- link: -
- anchor: six
- reference (file-local) at src:31:6-20:
- text: "Seven"
- link: -
- anchor: seven
- anchors:
- title1 (header II) at src:7:1-96
- one (hand made) at src:7:4-17
- two (hand made) at src:7:19-30
- three (hand made) at src:7:32-47
- four (hand made) at src:7:49-63
- five (hand made) at src:7:69-88
- six (hand made) at src:9:1-12
- seven (hand made) at src:11:6-17

All repository links are valid.