问题:用 VBA 解析 HTML

我正在尝试从网站的大约 500 个网址中提取数据。所有页面的结构都相同。我在理解这个特定网站的 HTML 时遇到了问题

https://www.coworker.com/s-f/6033/united-states_hawaii_honolulu_impact-hub-honolulu

我想提取姓名、地址、电话和网站。我当前的代码:

Sub GetData()
    Dim wsSheet As Worksheet, Rows As Long, links As Variant, IE As Object, link As Variant
    Set wb = ThisWorkbook
    Set wsSheet = wb.Sheets("Sheet1")

    Set IE = New InternetExplorer

    Rows = wsSheet.Cells(wsSheet.Rows.Count, "A").End(xlUp).Row
    links = wsSheet.Range("A1:A" & Rows)

    With IE
        .Visible = True
        For Each link In links
            .navigate (link)
            While .Busy Or .readyState <> 4: DoEvents: Wend

        Next
    End With
End Sub

解答

干得好。没有更多的链接来测试这个是非常脆弱的。它在很大程度上依赖于跨页面的一致样式。


XHR Looping 链接列表:

Option Explicit
Public Sub GetInfo()
    Dim wsSheet As Worksheet, Rows As Long, links(), link As Long, wb As Workbook, html As HTMLDocument
    Set wb = ThisWorkbook: Set wsSheet = wb.Sheets("Sheet1")
    Application.ScreenUpdating = False
   
    With wsSheet
        Rows = .Cells(.Rows.Count, "A").End(xlUp).Row
        If Rows = 1 Then
            ReDim links(1 To 1, 1 To 1)
            links(1, 1) = wsSheet.Range("A1")
        Else
            links = wsSheet.Range("A1:A" & Rows).Value
        End If
        Dim r As Long
        For link = LBound(links, 1) To UBound(links, 1)
            r = r + 1
            Set html = GetHTML(links(link, 1))
            On Error Resume Next
            Dim aNodeList As Object: Set aNodeList = html.querySelectorAll(".col-xs-12.pade_none.muchroom_mail")
            .Cells(r, 2) = "Name: " & html.querySelector(".col-sm-9.col-md-9.col-xs-12.pade_none.kohub_space_headings h2").innerText
            .Cells(r, 3) = "Address: " & aNodeList.item(0).innerText
            .Cells(r, 4) = "Tel: " & aNodeList.item(1).innerText
            .Cells(r, 5) = "Website: " & html.querySelector(".website-link-text a[href]").getAttribute("href")
             On Error GoTo 0
        Next link
    End With
    Application.ScreenUpdating = True
End Sub

Public Function GetHTML(ByVal url As String) As HTMLDocument
    Dim sResponse As String, html As New HTMLDocument
    With CreateObject("MSXML2.XMLHTTP")
        .Open "GET", url, False
        .send
        sResponse = StrConv(.responseBody, vbUnicode)
    End With
    sResponse = Mid$(sResponse, InStr(1, sResponse, "<!DOCTYPE "))
   
    With html
        .body.innerHTML = sResponse
    End With
    Set GetHTML = html
End Function

输出:

输出


参考(VBE>工具>参考):

  1. HTML对象库

Internet Explorer:

Option Explicit
Public Sub GetInfo()
    Dim wsSheet As Worksheet, Rows As Long, links(), link As Long, wb As Workbook, html As HTMLDocument, ie As InternetExplorer
    Set wb = ThisWorkbook: Set wsSheet = wb.Sheets("Sheet1")
    Application.ScreenUpdating = False
    With wsSheet
        Rows = .Cells(.Rows.Count, "A").End(xlUp).Row
        If Rows = 1 Then
            ReDim links(1, 1)
            links(1, 1) = wsSheet.Range("A1")
        Else
            links = wsSheet.Range("A1:A" & Rows).Value
        End If
        Dim r As Long
        
        Set ie = New InternetExplorer
        ie.Visible = True

        For link = LBound(links, 1) To UBound(links, 1)
            ie.navigate links(link, 1)
            While ie.Busy Or ie.readyState < 4: DoEvents: Wend
          '  Application.Wait Now + TimeSerial(0, 0, 10)
            On Error Resume Next
            r = r + 1: Set html = ie.document
            .Cells(r, 2) = "Name: " & html.querySelector(".col-sm-9.col-md-9.col-xs-12.pade_none.kohub_space_headings h2").innerText
            .Cells(r, 3) = "Address: " & html.querySelector(".col-xs-12.pade_none.muchroom_mail").innerText
            .Cells(r, 4) = "Tel: " & html.querySelector(".fa.fa-phone.fa-rotate-270 ~ a").innerText
            .Cells(r, 5) = "Website: " & html.querySelector(".website-link-text a[href]").getAttribute("href")
            On Error GoTo 0
        Next link
        ie.Quit
    End With
    Application.ScreenUpdating = True
End Sub

参考(VBE>工具>参考):

  1. HTML对象库

  2. 微软互联网控制


Logo

学AI,认准AI Studio!GPU算力,限时免费领,邀请好友解锁更多惊喜福利 >>>

更多推荐