Skip to content

Commit

Permalink
fixes JustinGrote#7, remove net4.0, refactor psm1
Browse files Browse the repository at this point in the history
  • Loading branch information
trackd committed Jul 5, 2024
1 parent a74a655 commit 8567b24
Show file tree
Hide file tree
Showing 5 changed files with 51 additions and 56 deletions.
6 changes: 3 additions & 3 deletions PowerHTML.psd1
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ RootModule = 'PowerHTML.psm1'
ModuleVersion = '0.2.0'

# Supported PSEditions
# CompatiblePSEditions = @()
CompatiblePSEditions = 'Core', 'Desktop'

# ID used to uniquely identify this module
GUID = '5be91c3d-59a7-469b-bca7-bdc619347b64'
Expand All @@ -33,7 +33,7 @@ Copyright = '(c) 2024 Justin Grote. All rights reserved.'
Description = 'Provides a wrapper for HTML Agility Pack for use where the IE HTML DOM from Invoke-WebRequest is not available such as Powershell Core'

# Minimum version of the Windows PowerShell engine required by this module
# PowerShellVersion = ''
PowerShellVersion = '5.1'

# Name of the Windows PowerShell host required by this module
# PowerShellHostName = ''
Expand All @@ -54,7 +54,7 @@ Description = 'Provides a wrapper for HTML Agility Pack for use where the IE HTM
# RequiredModules = @()

# Assemblies that must be loaded prior to importing this module
#RequiredAssemblies = '.\lib\HtmlAgilityPack-1.7.0-netstandard2.dll'
RequiredAssemblies = @('.\lib\HtmlAgilityPack-1.11.60-netstandard2.dll')

# Script files (.ps1) that are run in the caller's environment prior to importing this module.
# ScriptsToProcess = @()
Expand Down
47 changes: 2 additions & 45 deletions PowerHTML.psm1
Original file line number Diff line number Diff line change
@@ -1,54 +1,11 @@
#Get public and private function definition files.
$PublicFunctions = @( Get-ChildItem -Path $PSScriptRoot\Public\*.ps1 -ErrorAction Ignore )
$PrivateFunctions = @( Get-ChildItem -Path $PSScriptRoot\Private\*.ps1 -ErrorAction Ignore )

#Get JSON settings files
$ModuleSettings = @( Get-ChildItem -Path $PSScriptRoot\Settings\*.json -ErrorAction Ignore )

#Determine which assembly versions to load
#See if .Net Standard 2.0 is available on the system and if not, load the legacy Net 4.0 library
try {
Add-Type -AssemblyName 'netstandard, Version=2.0.0.0, Culture=neutral, PublicKeyToken=cc7b13ffcd2ddd51' -ErrorAction Stop
#If netstandard is not available it won't get this far
$dotNetTarget = "netstandard2"
} catch {
$dotNetTarget = "net40-client"
}

$AssembliesToLoad = Get-ChildItem -Path "$PSScriptRoot\lib\*-$dotNetTarget.dll"
if ($AssembliesToLoad) {
#If we are in a build or a pester test, load assemblies from a temporary file so they don't lock the original file
#This helps to prevent cleaning problems due to a powershell session locking the file because unloading a module doesn't unload assemblies
if ($BuildTask -or $TestDrive) {
write-verbose "Detected Invoke-Build or Pester, loading assemblies from a temp location to avoid locking issues"
if ($Global:BuildAssembliesLoadedPreviously) {
write-warning "You are in a build or test environment. We detected that module assemblies were loaded in this same session on a previous build or test. Strongly recommend you kill the process and start a new session for a clean build/test!"
}

$TempAssembliesToLoad = @()
foreach ($AssemblyPathItem in $AssembliesToLoad) {
$TempAssemblyPath = [System.IO.Path]::GetTempFileName() + ".dll"
Copy-Item $AssemblyPathItem $TempAssemblyPath
$TempAssembliesToLoad += [System.IO.FileInfo]$TempAssemblyPath
}
$AssembliesToLoad = $TempAssembliesToLoad
$Global:BuildAssembliesLoadedPreviously = $true
}

write-verbose "Loading Assemblies for .NET target: $dotNetTarget"
Add-Type -Path $AssembliesToLoad.fullname -ErrorAction Stop
}

#Dot source the files
foreach ($FunctionToImport in @($PublicFunctions + $PrivateFunctions)) {
foreach ($FunctionToImport in $PublicFunctions) {
try {
. $FunctionToImport.fullname
} catch {
Write-Error -Message "Failed to import function $($import.fullname): $_"
Write-Error -Message "Failed to import function $($FunctionToImport.fullname)"
}
}

#Import Settings files as global objects based on their filename
foreach ($ModuleSettingsItem in $ModuleSettings) {
New-Variable -Name "$($ModuleSettingsItem.basename)" -Scope Global -Value (convertfrom-json (Get-Content -raw $ModuleSettingsItem.fullname)) -Force
}
25 changes: 17 additions & 8 deletions Public/ConvertFrom-HTML.ps1
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@

function ConvertFrom-Html {
function ConvertFrom-Html {
<#
.SYNOPSIS
Takes an HTML input and converts it to an HTMLAgilityPack htmlNode object that can be navigated using Linq
Expand Down Expand Up @@ -57,28 +56,32 @@ function ConvertFrom-Html {
.NOTES
General notes
#>
[OutputType([HtmlAgilityPack.HtmlNode])]
[OutputType([HtmlAgilityPack.HtmlDocument])]
[OutputType([HtmlAgilityPack.HtmlNode], [HtmlAgilityPack.HtmlDocument])]
[CmdletBinding(DefaultParameterSetName = 'String')]
param(
#The HTML text to parse. Accepts multiple separate documents as an array. This also accepts pipeline from Invoke-WebRequest
#The HTML text to parse. Accepts multiple separate documents as an array.
[Parameter(ParameterSetName = 'String', Mandatory, ValueFromPipeline, ValueFromPipelineByPropertyName, Position = 0)]
[String[]] $Content,

#The URI or URIs from which to retrieve content. This may be faster than using Invoke-WebRequest but is less flexible in the method of retrieval (for instance, no POST)
[Parameter(ParameterSetName = 'URI', Mandatory, ValueFromPipeline, ValueFromPipelineByPropertyName, Position = 0)]
[Alias('URL')]
[System.URI[]] $URI,

#Path to file or files containing HTML content to convert. This accepts pipeline from Get-Childitem or Get-Item
[Parameter(ParameterSetName = 'Path', Mandatory, ValueFromPipeline, ValueFromPipelineByPropertyName, Position = 0)]
[System.IO.FileInfo[]] $Path,

#The web response object from Invoke-WebRequest. This is used to extract the content from the web response object
[Parameter(ParameterSetName = 'WebResponse', Mandatory, ValueFromPipeline, ValueFromPipelineByPropertyName, Position = 0)]
[Microsoft.PowerShell.Commands.WebResponseObject[]] $WebResponse,

#Do not return the Linq DocumentNode; instead, return the HtmlDocument object. This is useful if you want to do XPath queries instead of Linq queries
[switch] $Raw
)
begin {
$html = [HtmlAgilityPack.HtmlDocument]::new()
$web = [HtmlAgilityPack.HtmlWeb]::new()
$html = [HtmlAgilityPack.HtmlDocument]::new()
$web = [HtmlAgilityPack.HtmlWeb]::new()
}
process {
switch ($PSCmdlet.ParameterSetName) {
Expand All @@ -103,7 +106,13 @@ function ConvertFrom-Html {
if ($Raw) { $html } else { $html.DocumentNode }
}
}
'WebResponse' {
$WebResponse | ForEach-Object {
Write-Verbose "Loading WebResponse"
$html.LoadHtml($_.Content)
if ($Raw) { $html } else { $html.DocumentNode }
}
}
}
}

}
29 changes: 29 additions & 0 deletions Tests/PowerHTML.Tests.ps1
Original file line number Diff line number Diff line change
Expand Up @@ -94,4 +94,33 @@ Describe 'HTTP Operational Tests - REQUIRES INTERNET CONNECTION!' {
$result[1].innertext | Should -Match 'Facebook'
$result[2].innertext | Should -Match 'X\.com'
}
It 'can parse special characters in HTML' {
# Not sure how stable this site is; happy to switch to a different source, or to a mocked response once the mock can be made to work properly.
$Result = Invoke-WebRequest -Uri "https://www.compart.com/en/unicode/U+00FC" | ConvertFrom-Html
$result | Should -BeOfType HtmlAgilityPack.HTMLNode
$Result.SelectNodes('//span[@class="box"]').InnerText | Should -Be ([char]0x00FC)
}
}

<#
# Couldn't get a properly mocked WebResponseObject to work, so this test is disabled for now.
Describe 'Testing Encoding' {
BeforeAll {
Add-Type -AssemblyName System.Net.Http
$RM = [System.Net.Http.HttpResponseMessage]::new()
$RM.StatusCode = 200
$RM.Content = [System.Net.Http.StringContent]::new('<html><body><span class="box">ü</span></body></html>')
$RM.Content.Headers.ContentType = [System.Net.Http.Headers.MediaTypeHeaderValue]::new('text/html')
$RM.Content.Headers.ContentType.CharSet = 'utf-8'
$Stream = $RM.Content.ReadAsStreamAsync().Result
$ct = [System.Threading.CancellationToken]::None
$ts = New-TimeSpan -Seconds 1
$wro = [Microsoft.PowerShell.Commands.WebResponseObject]::New($RM, $Stream, $ts, $ct)
}
It 'Can parse special characters in HTML' {
$Result = ConvertFrom-Html -WebResponse $wro
$result | Should -BeOfType HtmlAgilityPack.HTMLNode
$Result.SelectNodes('//span[@class="box"]').InnerText | Should -Be ([char]0x00FC)
}
}
#>
Binary file removed lib/HtmlAgilityPack-1.11.60-net40-client.dll
Binary file not shown.

0 comments on commit 8567b24

Please sign in to comment.