Skip to content

Commit

Permalink
Docx reader: parse index references as empty Spans.
Browse files Browse the repository at this point in the history
See #10171.
  • Loading branch information
jgm committed Dec 5, 2024
1 parent c34edf6 commit d07ada4
Show file tree
Hide file tree
Showing 4 changed files with 40 additions and 21 deletions.
1 change: 1 addition & 0 deletions src/Text/Pandoc/Readers/Docx.hs
Original file line number Diff line number Diff line change
Expand Up @@ -472,6 +472,7 @@ parPartToInlines' (OMathPara exps) =
parPartToInlines' (Field info children) =
case info of
HyperlinkField url -> parPartToInlines' $ ExternalHyperLink url children
IndexrefField entry -> pure $ spanWith ("",["indexref"],[("entry",entry)]) mempty
PagerefField fieldAnchor True -> parPartToInlines' $ InternalHyperLink fieldAnchor children
EndNoteCite t -> do
formattedCite <- smushInlines <$> mapM parPartToInlines' children
Expand Down
33 changes: 20 additions & 13 deletions src/Text/Pandoc/Readers/Docx/Fields.hs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ type Anchor = T.Text
data FieldInfo = HyperlinkField URL
-- The boolean indicates whether the field is a hyperlink.
| PagerefField Anchor Bool
| IndexrefField T.Text
| CslCitation T.Text
| CslBibliography
| EndNoteCite T.Text
Expand All @@ -38,26 +39,27 @@ parseFieldInfo :: T.Text -> Either ParseError FieldInfo
parseFieldInfo = parse fieldInfo ""

fieldInfo :: Parser FieldInfo
fieldInfo =
try (HyperlinkField <$> hyperlink)
<|>
try ((uncurry PagerefField) <$> pageref)
<|>
try addIn
<|>
return UnknownField
fieldInfo = do
spaces
(HyperlinkField <$> hyperlink)
<|>
((uncurry PagerefField) <$> pageref)
<|>
(IndexrefField <$> indexref)
<|>
addIn
<|>
return UnknownField

addIn :: Parser FieldInfo
addIn = do
spaces
string "ADDIN"
spaces
try cslCitation <|> cslBibliography <|> endnoteCite <|> endnoteRefList

cslCitation :: Parser FieldInfo
cslCitation = do
optional (string "ZOTERO_ITEM")
spaces
optional (string "ZOTERO_ITEM" *> spaces)
string "CSL_CITATION"
spaces
CslCitation <$> getInput
Expand Down Expand Up @@ -107,7 +109,6 @@ hyperlinkSwitch = do

hyperlink :: Parser URL
hyperlink = do
many space
string "HYPERLINK"
spaces
farg <- option "" $ notFollowedBy (char '\\') *> fieldArgument
Expand All @@ -127,7 +128,6 @@ pagerefSwitch = do

pageref :: Parser (Anchor, Bool)
pageref = do
many space
string "PAGEREF"
spaces
farg <- fieldArgument
Expand All @@ -136,3 +136,10 @@ pageref = do
("\\h", _) : _ -> True
_ -> False
return (farg, isLink)

-- | Parse an index-entry (@XE@) field instruction: the literal keyword
-- @XE@, any amount of whitespace, and then a single field argument.
-- The argument is returned as the text of the index entry.
indexref :: Parser T.Text
indexref = string "XE" *> spaces *> fieldArgument

25 changes: 18 additions & 7 deletions src/Text/Pandoc/Readers/Docx/Parse.hs
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ data ReaderState = ReaderState { stateWarnings :: [T.Text]
deriving Show

data FldCharState = FldCharOpen
| FldCharFieldInfo FieldInfo
| FldCharFieldInfo T.Text
| FldCharContent FieldInfo [ParPart]
deriving (Show)

Expand Down Expand Up @@ -941,6 +941,10 @@ example (omissions and my comments in brackets):
<w:fldChar w:fldCharType="end"/>
</w:r>
Note that there may be multiple w:instrText elements in a row.
For example, you might first have ` XE "`, then `Kay, Alan`, then `"`.
The texts in all of them should be concatenated before it is processed!
So we do this in a number of steps. If we encounter the fldchar begin
tag, we open a fldchar state variable (see state above). We add
the instrtext to it as raw text (it is parsed into a FieldInfo only when
the separate or end tag is reached). Then we close that and start adding
Expand All @@ -961,13 +965,15 @@ elemToParPart ns element
_ | fldCharType == "begin" -> do
modify $ \st -> st {stateFldCharState = FldCharOpen : fldCharState}
return []
FldCharFieldInfo info : ancestors | fldCharType == "separate" -> do
FldCharFieldInfo t : ancestors | fldCharType == "separate" -> do
info <- eitherToD $ parseFieldInfo t
modify $ \st -> st {stateFldCharState = FldCharContent info [] : ancestors}
return []
-- Some fields have no content, since Pandoc doesn't understand any of those fields, we can just close it.
FldCharFieldInfo _ : ancestors | fldCharType == "end" -> do
-- Some fields have no content, e.g. index XE:
FldCharFieldInfo t : ancestors | fldCharType == "end" -> do
modify $ \st -> st {stateFldCharState = ancestors}
return []
info <- eitherToD $ parseFieldInfo t
return [Field info []]
[FldCharContent info children] | fldCharType == "end" -> do
modify $ \st -> st {stateFldCharState = []}
return [Field info $ reverse children]
Expand All @@ -982,8 +988,13 @@ elemToParPart ns element
fldCharState <- gets stateFldCharState
case fldCharState of
FldCharOpen : ancestors -> do
info <- eitherToD $ parseFieldInfo $ strContent instrText
modify $ \st -> st {stateFldCharState = FldCharFieldInfo info : ancestors}
modify $ \st -> st {stateFldCharState =
FldCharFieldInfo (strContent instrText) : ancestors}
return []
FldCharFieldInfo t : ancestors -> do
modify $ \st -> st {stateFldCharState =
FldCharFieldInfo (t <> strContent instrText) :
ancestors}
return []
_ -> return []
{-
Expand Down
2 changes: 1 addition & 1 deletion test/docx/empty_field.native
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[Para
[Str "\24076\26395\28145\20837\20102\35299\30340\35835\32773\21487\20197\21435\30475David",Space,Str "French",Space,Str "Belding\21644Kevin",Space,Str "J.",Space,Str "Mitchell\30340"
[Str "\24076\26395\28145\20837\20102\35299\30340\35835\32773\21487\20197\21435\30475David",Space,Str "French",Span ( "" , [ "indexref" ] , [ ( "entry" , "French" ) ] ) [],Space,Str "Belding\21644Kevin",Space,Str "J.",Space,Str "Mitchell\30340"
,Link ("",[],[]) [Str "Foundations",Space,Str "of",Space,Str "Analysis,",Space,Str "2nd",Space,Str "Edition"] ("https://books.google.com/books?id=sp_Zcb9ot90C&lpg=PR4&hl=zh-CN&pg=PA19#v=onepage&q&f=true",""),Str ",\21487\20174\&19\39029\30475\36215\65292\25110D.C.",Space,Str "Goldrei\30340",Space
,Link ("",[],[]) [Str "Classic",Space,Str "Set",Space,Str "Theory:",Space,Str "For",Space,Str "Guided",Space,Str "Independent",Space,Str "Study"] ("https://books.google.ae/books?id=dlc0DwAAQBAJ&lpg=PT29&hl=zh-CN&pg=PT26#v=onepage&q&f=true","")
,Str "\65292\20174\31532\20108\31456\30475\36215\65292\38405\35835\26102\35201\27880\24847\26412\25991\19982\36825\20123\20070\25152\19981\21516\30340\26159\24182\27809\26377\25226\23454\25968\30475\20316\26159\26377\29702\25968\38598\30340\20998\21106\12290"]
Expand Down

0 comments on commit d07ada4

Please sign in to comment.