用 VBA 解析 HTML
问题:用 VBA 解析 HTML 我正在尝试从网站的大约 500 个网址中提取数据。所有页面的结构都相同。我在理解这个特定网站的 HTML 时遇到了问题 https://www.coworker.com/s-f/6033/united-states_hawaii_honolulu_impact-hub-honolulu 我想提取姓名、地址、电话和网站。我当前的代码: Sub GetData() D
·
问题:用 VBA 解析 HTML
我正在尝试从网站的大约 500 个网址中提取数据。所有页面的结构都相同。我在理解这个特定网站的 HTML 时遇到了问题
https://www.coworker.com/s-f/6033/united-states_hawaii_honolulu_impact-hub-honolulu
我想提取姓名、地址、电话和网站。我当前的代码:
Sub GetData()
Dim wsSheet As Worksheet, Rows As Long, links As Variant, IE As Object, link As Variant
Set wb = ThisWorkbook
Set wsSheet = wb.Sheets("Sheet1")
Set IE = New InternetExplorer
Rows = wsSheet.Cells(wsSheet.Rows.Count, "A").End(xlUp).Row
links = wsSheet.Range("A1:A" & Rows)
With IE
.Visible = True
For Each link In links
.navigate (link)
While .Busy Or .readyState <> 4: DoEvents: Wend
Next
End With
End Sub
解答
干得好。没有更多的链接来测试这个是非常脆弱的。它在很大程度上依赖于跨页面的一致样式。
XHR Looping 链接列表:
Option Explicit
Public Sub GetInfo()
Dim wsSheet As Worksheet, Rows As Long, links(), link As Long, wb As Workbook, html As HTMLDocument
Set wb = ThisWorkbook: Set wsSheet = wb.Sheets("Sheet1")
Application.ScreenUpdating = False
With wsSheet
Rows = .Cells(.Rows.Count, "A").End(xlUp).Row
If Rows = 1 Then
ReDim links(1 To 1, 1 To 1)
links(1, 1) = wsSheet.Range("A1")
Else
links = wsSheet.Range("A1:A" & Rows).Value
End If
Dim r As Long
For link = LBound(links, 1) To UBound(links, 1)
r = r + 1
Set html = GetHTML(links(link, 1))
On Error Resume Next
Dim aNodeList As Object: Set aNodeList = html.querySelectorAll(".col-xs-12.pade_none.muchroom_mail")
.Cells(r, 2) = "Name: " & html.querySelector(".col-sm-9.col-md-9.col-xs-12.pade_none.kohub_space_headings h2").innerText
.Cells(r, 3) = "Address: " & aNodeList.item(0).innerText
.Cells(r, 4) = "Tel: " & aNodeList.item(1).innerText
.Cells(r, 5) = "Website: " & html.querySelector(".website-link-text a[href]").getAttribute("href")
On Error GoTo 0
Next link
End With
Application.ScreenUpdating = True
End Sub
Public Function GetHTML(ByVal url As String) As HTMLDocument
Dim sResponse As String, html As New HTMLDocument
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", url, False
.send
sResponse = StrConv(.responseBody, vbUnicode)
End With
sResponse = Mid$(sResponse, InStr(1, sResponse, "<!DOCTYPE "))
With html
.body.innerHTML = sResponse
End With
Set GetHTML = html
End Function
输出:
参考(VBE>工具>参考):
- HTML对象库
Internet Explorer:
Option Explicit
Public Sub GetInfo()
Dim wsSheet As Worksheet, Rows As Long, links(), link As Long, wb As Workbook, html As HTMLDocument, ie As InternetExplorer
Set wb = ThisWorkbook: Set wsSheet = wb.Sheets("Sheet1")
Application.ScreenUpdating = False
With wsSheet
Rows = .Cells(.Rows.Count, "A").End(xlUp).Row
If Rows = 1 Then
ReDim links(1, 1)
links(1, 1) = wsSheet.Range("A1")
Else
links = wsSheet.Range("A1:A" & Rows).Value
End If
Dim r As Long
Set ie = New InternetExplorer
ie.Visible = True
For link = LBound(links, 1) To UBound(links, 1)
ie.navigate links(link, 1)
While ie.Busy Or ie.readyState < 4: DoEvents: Wend
' Application.Wait Now + TimeSerial(0, 0, 10)
On Error Resume Next
r = r + 1: Set html = ie.document
.Cells(r, 2) = "Name: " & html.querySelector(".col-sm-9.col-md-9.col-xs-12.pade_none.kohub_space_headings h2").innerText
.Cells(r, 3) = "Address: " & html.querySelector(".col-xs-12.pade_none.muchroom_mail").innerText
.Cells(r, 4) = "Tel: " & html.querySelector(".fa.fa-phone.fa-rotate-270 ~ a").innerText
.Cells(r, 5) = "Website: " & html.querySelector(".website-link-text a[href]").getAttribute("href")
On Error GoTo 0
Next link
ie.Quit
End With
Application.ScreenUpdating = True
End Sub
参考(VBE>工具>参考):
-
HTML对象库
-
微软互联网控制
更多推荐
已为社区贡献126473条内容
所有评论(0)