Here is the example using XHR and RegEx to retrieve all table data from the webpage:
Option Explicit
Sub ExtractDataWunderground()
Dim aResult() As String
Dim sContent As String
Dim i As Long
Dim j As Long
' retrieve html content
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", "https://www.wunderground.com/history/airport/EPGD/2016/10/24/DailyHistory.html", False
.Send
sContent = .ResponseText
End With
' parse with regex
With CreateObject("VBScript.RegExp")
.Global = True
.MultiLine = True
.IgnoreCase = True
' minor html simplification
.Pattern = "<span[^>]*>|</span>|[
]*"
sContent = .Replace(sContent, "")
' match each table row
.Pattern = "<tr><td class=""indent"">(.*?)</td><td>(.*?)</td><td>(.*?)</td><td>(.*?)</td></tr>"
With .Execute(sContent)
ReDim aResult(1 To .Count, 1 To 4)
' each row
For i = 1 To .Count
With .Item(i - 1)
' each cell
For j = 1 To 4
aResult(i, j) = DecodeHTMLEntities(.SubMatches(j - 1))
Next
End With
Next
End With
End With
' output result
Cells.Delete
Output Cells(1, 1), aResult
MsgBox "Completed"
End Sub
Function DecodeHTMLEntities(sText As String) As String
Static oHtmlfile As Object
Static oDiv As Object
If oHtmlfile Is Nothing Then
Set oHtmlfile = CreateObject("htmlfile")
oHtmlfile.Open
Set oDiv = oHtmlfile.createElement("div")
End If
oDiv.innerHTML = sText
DecodeHTMLEntities = oDiv.innerText
End Function
Sub Output(oDstRng As Range, aCells As Variant)
With oDstRng
.Parent.Select
With .Resize( _
UBound(aCells, 1) - LBound(aCells, 1) + 1, _
UBound(aCells, 2) - LBound(aCells, 2) + 1 _
)
.NumberFormat = "@"
.Value = aCells
.Columns.AutoFit
End With
End With
End Sub
The output is as follows for me:
To extract the mean temperature only you can get the value from the first match having 0 index, since the mean temperature is in the first row of the table:
Sub ExtractMeanTempWunderground()
Dim sContent As String
' retrieve html content
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", "https://www.wunderground.com/history/airport/EPGD/2016/10/24/DailyHistory.html", False
.Send
sContent = .ResponseText
End With
' parse with regex
With CreateObject("VBScript.RegExp")
.Global = True
.MultiLine = True
.IgnoreCase = True
' minor html simplification
.Pattern = "<span[^>]*>|</span>|[
]*"
sContent = .Replace(sContent, "")
' match each table row
.Pattern = "<tr><td class=""indent"">.*?</td><td>(.*?)</td><td>.*?</td><td>.*?</td></tr>"
With .Execute(sContent)
If .Count = 15 Then
' get the first row value only
MsgBox DecodeHTMLEntities(.Item(0).SubMatches(0))
Else
MsgBox "Data structure inconsistence detected"
End If
End With
End With
End Sub
Function DecodeHTMLEntities(sText As String) As String
Static oHtmlfile As Object
Static oDiv As Object
If oHtmlfile Is Nothing Then
Set oHtmlfile = CreateObject("htmlfile")
oHtmlfile.Open
Set oDiv = oHtmlfile.createElement("div")
End If
oDiv.innerHTML = sText
DecodeHTMLEntities = oDiv.innerText
End Function
Note, such methods will work until the webpage structure is changed.
与恶龙缠斗过久,自身亦成为恶龙;凝视深渊过久,深渊将回以凝视…