Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update HtmlAgilityPack, Pester Tests, refactor #11

Merged
merged 11 commits into from
May 22, 2024
56 changes: 28 additions & 28 deletions PSModule.build.ps1
Original file line number Diff line number Diff line change
Expand Up @@ -105,33 +105,33 @@ Enter-Build {
Get-Variable | select-object name, value, visibility | format-table -autosize | out-string | write-verbose
}

#Register Nuget
if (!(get-packageprovider "Nuget" -ForceBootstrap -ErrorAction silentlycontinue)) {
write-verbose "Nuget Provider Not found. Fetching..."
Install-PackageProvider Nuget -forcebootstrap -scope currentuser @PassThruParams | out-string | write-verbose
write-verboseheader "Installed Nuget Provider Info"
Get-PackageProvider Nuget @PassThruParams | format-list | out-string | write-verbose
}

#Fix a bug with the Appveyor 2017 image having a broken nuget (points to v3 URL but installed packagemanagement doesn't query v3 correctly)
#Next command will add this back
if ($ENV:APPVEYOR -and ($ENV:APPVEYOR_BUILD_WORKER_IMAGE -eq 'Visual Studio 2017')) {
write-verbose "Detected Appveyor VS2017 Image, using v2 Nuget API"
UnRegister-PackageSource -Name nuget.org
}

#Add the nuget repository so we can download things like GitVersion
if (!(Get-PackageSource "nuget.org" -erroraction silentlycontinue)) {
write-verbose "Registering nuget.org as package source"
Register-PackageSource -provider NuGet -name nuget.org -location http://www.nuget.org/api/v2 -Trusted @PassThruParams | out-string | write-verbose
}
else {
$nugetOrgPackageSource = Set-PackageSource -name 'nuget.org' -Trusted @PassThruParams
if ($PassThruParams.Verbose) {
write-verboseheader "Nuget.Org Package Source Info "
$nugetOrgPackageSource | format-table | out-string | write-verbose
}
}
# #Register Nuget
# if (!(get-packageprovider "Nuget" -ForceBootstrap -ErrorAction silentlycontinue)) {
# write-verbose "Nuget Provider Not found. Fetching..."
# Install-PackageProvider Nuget -forcebootstrap -scope currentuser @PassThruParams | out-string | write-verbose
# write-verboseheader "Installed Nuget Provider Info"
# Get-PackageProvider Nuget @PassThruParams | format-list | out-string | write-verbose
# }

# #Fix a bug with the Appveyor 2017 image having a broken nuget (points to v3 URL but installed packagemanagement doesn't query v3 correctly)
# #Next command will add this back
# if ($ENV:APPVEYOR -and ($ENV:APPVEYOR_BUILD_WORKER_IMAGE -eq 'Visual Studio 2017')) {
# write-verbose "Detected Appveyor VS2017 Image, using v2 Nuget API"
# UnRegister-PackageSource -Name nuget.org
# }

# #Add the nuget repository so we can download things like GitVersion
# if (!(Get-PackageSource "nuget.org" -erroraction silentlycontinue)) {
# write-verbose "Registering nuget.org as package source"
# Register-PackageSource -provider NuGet -name nuget.org -location http://www.nuget.org/api/v2 -Trusted @PassThruParams | out-string | write-verbose
# }
# else {
# $nugetOrgPackageSource = Set-PackageSource -name 'nuget.org' -Trusted @PassThruParams
# if ($PassThruParams.Verbose) {
# write-verboseheader "Nuget.Org Package Source Info "
# $nugetOrgPackageSource | format-table | out-string | write-verbose
# }
# }

#Move to the Project Directory if we aren't there already
Set-Location $buildRoot
Expand Down Expand Up @@ -476,7 +476,7 @@ task PublishPSGallery -if (-not $SkipPublish) {
### SuperTasks
# These are the only supported items to run directly from Invoke-Build
task Deploy PreDeploymentChecks,Package,PublishGitHubRelease,PublishPSGallery
task Build Clean,CopyFilesToBuildDir,UpdateMetadata
task Build Clean, CopyFilesToBuildDir
task Test Pester

#Default Task - Build, Test with Pester, Deploy
Expand Down
6 changes: 2 additions & 4 deletions PowerHTML.psd1
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
RootModule = 'PowerHTML.psm1'

# Version number of this module.
ModuleVersion = '0.0.1'
ModuleVersion = '0.1.8'

# Supported PSEditions
# CompatiblePSEditions = @()
Expand Down Expand Up @@ -63,7 +63,7 @@ Description = 'Provides a wrapper for HTML Agility Pack for use where the IE HTM
# TypesToProcess = @()

# Format files (.ps1xml) to be loaded when importing this module
FormatsToProcess = @('.\Types\*.ps1xml')
FormatsToProcess = @('.\Types\HtmlAgilityPack.HtmlTextNode.ps1xml')

# Modules to import as nested modules of the module specified in RootModule/ModuleToProcess
# NestedModules = @()
Expand Down Expand Up @@ -120,5 +120,3 @@ PrivateData = @{
# DefaultCommandPrefix = ''

}


24 changes: 8 additions & 16 deletions PowerHTML.psm1
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
#Get public and private function definition files.
$PublicFunctions = @( Get-ChildItem -Path $PSScriptRoot\Public\*.ps1 -ErrorAction SilentlyContinue )
$PrivateFunctions = @( Get-ChildItem -Path $PSScriptRoot\Private\*.ps1 -ErrorAction SilentlyContinue )
$PublicFunctions = @( Get-ChildItem -Path $PSScriptRoot\Public\*.ps1 -ErrorAction Ignore )
$PrivateFunctions = @( Get-ChildItem -Path $PSScriptRoot\Private\*.ps1 -ErrorAction Ignore )

#Get JSON settings files
$ModuleSettings = @( Get-ChildItem -Path $PSScriptRoot\Settings\*.json -ErrorAction SilentlyContinue )
$ModuleSettings = @( Get-ChildItem -Path $PSScriptRoot\Settings\*.json -ErrorAction Ignore )

#Determine which assembly versions to load
#See if .Net Standard 2.0 is available on the system and if not, load the legacy Net 4.0 library
Expand Down Expand Up @@ -40,23 +40,15 @@ if ($AssembliesToLoad) {
}

#Dot source the files
#Every public and private function file is loaded into the module scope.
#A file that fails to load is reported with Write-Error, naming the file that failed.
foreach ($FunctionToImport in @($PublicFunctions + $PrivateFunctions)) {
    try {
        . $FunctionToImport.fullname
    } catch {
        #Use the loop variable here: the message previously interpolated an undefined $import variable,
        #so the path of the failing file was always empty.
        Write-Error -Message "Failed to import function $($FunctionToImport.fullname): $_"
    }
}

#Publish each JSON settings file as a global variable named after the file (without extension)
foreach ($ModuleSettingsItem in $ModuleSettings) {
    #Read the file as a single string so multi-line JSON is parsed as one document
    $settingsJson = Get-Content -raw $ModuleSettingsItem.fullname
    $settingsValue = convertfrom-json $settingsJson
    #-Force overwrites a global variable of the same name if one already exists
    New-Variable -Name "$($ModuleSettingsItem.basename)" -Scope Global -Value $settingsValue -Force
}

#Export the public functions. This requires them to match the standard Verb-Noun powershell cmdlet format as a safety mechanism
Export-ModuleMember -Function ($PublicFunctions.Basename | where {$PSitem -match '^\w+-\w+$'})
}
193 changes: 90 additions & 103 deletions Public/ConvertFrom-HTML.ps1
Original file line number Diff line number Diff line change
@@ -1,122 +1,109 @@
function ConvertFrom-Html {
    <#
    .SYNOPSIS
    Takes an HTML input and converts it to an HTMLAgilityPack htmlNode object that can be navigated using Linq
    .DESCRIPTION
    Parses HTML from one of three sources, selected by parameter set:
    raw HTML text (-Content), a URI to download (-URI), or a file on disk (-Path).
    Every input item is parsed into its own HtmlAgilityPack.HtmlDocument, so objects
    returned for earlier items are not affected when later items are processed.
    By default the document's DocumentNode is returned; use -Raw to get the HtmlDocument itself.
    .EXAMPLE
    $HTMLString = @'
    <!DOCTYPE html>
    <html>
    <body>
    <h1>My First Heading</h1>
    <p>My first paragraph.</p>d
    </body>
    </html>
    '@ | ConvertFrom-HTML

    $HTMLString

    NodeType Name AttributeCount ChildNodeCount ContentLength InnerText
    -------- ---- -------------- -------------- ------------- ---------
    Document #document 0 4 103 …

    $HTMLString.SelectSingleNode('//body/h1')

    NodeType Name AttributeCount ChildNodeCount ContentLength InnerText
    -------- ---- -------------- -------------- ------------- ---------
    Element h1 0 1 16 My First Heading

    Convert HTML string to a HtmlNode via the pipeline.

    .EXAMPLE
    $uri = [Uri]'https://www.powershellgallery.com/' | ConvertFrom-HTML
    $uri

    NodeType Name AttributeCount ChildNodeCount ContentLength InnerText
    -------- ---- -------------- -------------- ------------- ---------
    Document #document 0 4 17550 …

    Fetch and parse a url.
    .EXAMPLE
    Get-Item $testFilePath | ConvertFrom-Html

    NodeType Name AttributeCount ChildNodeCount ContentLength InnerText
    -------- ---- -------------- -------------- ------------- ---------
    Document #document 0 5 105 …

    Parse an HTML file piped from Get-Item.
    .INPUTS
    [String[]]
    [System.IO.FileInfo[]]
    [System.URI[]]
    .OUTPUTS
    [HtmlAgilityPack.HtmlDocument]
    [HtmlAgilityPack.HtmlNode]
    .NOTES
    A new HtmlDocument is created for each input item. Reusing a single document across
    items would make every -Raw result the same object, holding only the last input.
    #>
    [OutputType([HtmlAgilityPack.HtmlNode])]
    [OutputType([HtmlAgilityPack.HtmlDocument])]
    [CmdletBinding(DefaultParameterSetName = 'String')]
    param(
        #The HTML text to parse. Accepts multiple separate documents as an array. This also accepts pipeline from Invoke-WebRequest
        [Parameter(ParameterSetName = 'String', Mandatory, ValueFromPipeline, ValueFromPipelineByPropertyName, Position = 0)]
        [String[]] $Content,

        #The URI or URIs from which to retrieve content. This may be faster than using Invoke-WebRequest but is less flexible in the method of retrieval (for instance, no POST)
        [Parameter(ParameterSetName = 'URI', Mandatory, ValueFromPipeline, ValueFromPipelineByPropertyName, Position = 0)]
        [System.URI[]] $URI,

        #Path to file or files containing HTML content to convert. This accepts pipeline from Get-Childitem or Get-Item
        [Parameter(ParameterSetName = 'Path', Mandatory, ValueFromPipeline, ValueFromPipelineByPropertyName, Position = 0)]
        [System.IO.FileInfo[]] $Path,

        #Do not return the Linq documentnode, instead return the HTMLDocument object. This is useful if you want to do XPath queries instead of Linq queries
        [switch] $Raw
    )

    begin {
        #The web client holds no parsed document state of its own here, so one instance serves every URI
        $web = [HtmlAgilityPack.HtmlWeb]::new()
    }

    process {
        #The bound parameter set tells us which kind of input arrived; each branch unwraps arrays
        #so that both pipeline input and direct parameter input are handled
        switch ($PSCmdlet.ParameterSetName) {
            'String' {
                $Content | ForEach-Object {
                    Write-Verbose "Loading HTML"
                    #Fresh document per item so previously emitted results are never overwritten
                    $html = [HtmlAgilityPack.HtmlDocument]::new()
                    $html.LoadHtml($_)
                    if ($Raw) { $html } else { $html.DocumentNode }
                }
            }
            'URI' {
                $URI | ForEach-Object {
                    Write-Verbose "Loading URI $_"
                    #HtmlWeb.Load returns a new document for each request
                    $site = $web.Load($_)
                    if ($Raw) { $site } else { $site.DocumentNode }
                }
            }
            'Path' {
                $Path | ForEach-Object {
                    Write-Verbose "Loading File $_"
                    #Fresh document per file, loaded from the file's full path
                    $html = [HtmlAgilityPack.HtmlDocument]::new()
                    $html.Load($_.FullName)
                    if ($Raw) { $html } else { $html.DocumentNode }
                }
            }
        }
    }
}
Loading