Scrape Multiple Google Answer Box Layouts with Python
This blog post shows how to scrape weather, stock, converter, calculator, phone number, address, population, translation, dictionary, matches, flights, formula, and multi-layout organic answer box results using Python.
Note: not every possible layout is covered here. Some of them I may simply not have come across.
Contents:
- Calculator Answer Box
- Weather Answer Box
- Stock Answer Box
- Population Answer Box
- Converter Answer Box
- Dictionary Answer Box
- First Organic Result Answer Box
- Second Organic Result Answer Box
- Third Organic Result Answer Box
- Translation Answer Box
- Formula Answer Box
- Flights Answer Box
- Matches Answer Box
- Address Answer Box
- Using SerpApi
- Links
- Outro
Intro
You'll see usage of if/elif/else, try/except, list comprehensions, and the zip() function. Simple stuff, but if you're a bit new to any of it, make sure to check what some of these things are doing, since there won't be a proper explanation of them here. An alternative API solution will also be shown.
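For reference, here is a minimal refresher sketch of those constructs (the values are made up for illustration):

# zip() iterates over two (or more) sequences in parallel
keys = ['Open', 'High', 'Low']
values = ['598.57', '598.57', '582.78']
for key, value in zip(keys, values):
    print(f'{key}: {value}')

# list comprehension: build a list in a single expression
sources = [source.strip() for source in [' Eurostat ', ' Feedback ']]

# try/except: skip an element instead of crashing, e.g. when a
# selector finds nothing and returns None
element = None
try:
    text = element.text
except AttributeError:
    text = None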
Prerequisites
Familiarity with the beautifulsoup, requests, and selenium libraries, and with CSS selectors.
To grab CSS selectors, check out the SelectorGadget Chrome extension, which makes it easy to get a CSS selector by clicking on the desired element in the browser. CSS selectors reference.
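For example, a quick illustration of how CSS selectors map to beautifulsoup's select()/select_one() calls (the HTML fragment is made up, reusing selectors that appear later in this post):

from bs4 import BeautifulSoup

html = '''
<div id="wob_loc">London, UK</div>
<span class="wob_t">20</span>
<span class="wob_t">15</span>
'''
soup = BeautifulSoup(html, 'lxml')

print(soup.select_one('#wob_loc').text)         # "#" is an id selector: first match only
print([t.text for t in soup.select('.wob_t')])  # "." is a class selector: all matches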
Scrape Google Calculator Answer Box

import requests, lxml
from bs4 import BeautifulSoup

headers = {
    "User-agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}

params = {
    "q": "32*3/3+12*332-1995",
}

def get_calculator_answerbox():
    html = requests.get('https://www.google.com/search', headers=headers, params=params)
    soup = BeautifulSoup(html.text, 'lxml')

    math_expression = soup.select_one('.XH1CIc').text.strip().replace(' =', '')
    calc_answer = soup.select_one('#cwos').text.strip()

    print(f"Expression: {math_expression}\nAnswer: {calc_answer}")

get_calculator_answerbox()
'''
Expression: ((32 * 3) / 3) + (12 * 332) - 1995
Answer: 2021
'''
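As a quick sanity check, the same expression can be evaluated locally (Python uses true division, hence the trailing .0):

print(32*3/3+12*332-1995)  # 2021.0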
Scrape Google Weather Answer Box

from bs4 import BeautifulSoup
import requests, lxml

headers = {
    "User-Agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}

params = {
    "q": "london weather",  # query
    "gl": "uk"              # country to search from (United Kingdom)
}

def get_weather_answerbox():
    response = requests.get('https://www.google.com/search', headers=headers, params=params)
    soup = BeautifulSoup(response.text, 'lxml')

    location = soup.select_one('#wob_loc').text
    weather_condition = soup.select_one('#wob_dc').text
    temperature = soup.select_one('#wob_tm').text
    precipitation = soup.select_one('#wob_pp').text
    humidity = soup.select_one('#wob_hm').text
    wind = soup.select_one('#wob_ws').text
    current_time = soup.select_one('#wob_dts').text

    print(f'Location: {location}\n'
          f'Weather condition: {weather_condition}\n'
          f'Temperature: {temperature}°C\n'
          f'Precipitation: {precipitation}\n'
          f'Humidity: {humidity}\n'
          f'Wind speed: {wind}\n'
          f'Current time: {current_time}\n')

    print('Forecast wind:')
    for wind_speed_direction in soup.select('.wob_noe .wob_hw'):
        try:
            wind_speed = wind_speed_direction.select_one('.wob_t').text
            '''
            extracts the element, splits the string by a SPACE, grabs indexes 2 and 3,
            and then joins them via SPACE.
            Example:
            7 mph From northwest Sunday 9:00 AM ---> From northwest
            '''
            wind_direction = ' '.join(wind_speed_direction.select_one('.wob_t')['aria-label'].split(' ')[2:4])
            print(f"Wind Speed: {wind_speed}\nWind Direction: {wind_direction}\n")
        except:
            pass  # or return None instead

    print('Forecast temperature:')
    for forecast in soup.select('.wob_df'):
        day = forecast.select_one('.Z1VzSb')['aria-label']
        weather = forecast.select_one('img.uW5pk')['alt']

        # check if selector exists in "current" HTML layout
        if forecast.select_one('.vk_gy .wob_t:nth-child(1)') is None:
            # different HTML layout selector
            high_temp = forecast.select_one('.gNCp2e .wob_t').text
        else:
            # different HTML layout selector
            high_temp = forecast.select_one('.vk_gy .wob_t:nth-child(1)').text

        low_temp = forecast.select_one('.QrNVmd .wob_t:nth-child(1)').text
        print(f'Day: {day}\nWeather: {weather}\nHigh: {high_temp}, Low: {low_temp}\n')

    print("Forecast Precipitation:")
    for precipitation_forecast in soup.select('.wob_hw'):
        try:
            precipitation = precipitation_forecast.select_one('.XwOqJe')['aria-label'].split(' ')[0]
        except:
            precipitation = None
        print(precipitation)

get_weather_answerbox()
---------
'''
Location: London, UK
Weather condition: Partly cloudy
Temperature: 20°C
Precipitation: 0%
Humidity: 55%
Wind speed: 7 mph
Current time: Monday 15:00
Forecast wind:
Wind Speed: 8 mph
Wind Direction: From east
...
Forecast temperature:
Day: Monday
Weather: Partly cloudy
High: 21, Low: 15
...
Forecast Precipitation:
0%
1%
1%
2%
2%
4%
12%
'''
Scrape Google Stock Answer Box
There are currently two layouts, depending on the country the search originates from (the 'gl'='COUNTRY' query parameter):


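Before the full code, a minimal sketch of the pattern used below to cope with both layouts: select_one() accepts a comma-separated group of selectors and returns whichever element matches first (the HTML snippets are made up; the selectors are the real ones from the code below):

from bs4 import BeautifulSoup

# two hypothetical variants of the same price element, one per layout
layouts = ['<span class="wT3VGc">589.29</span>',
           '<span class="XcVN5d">589.29</span>']

for html in layouts:
    soup = BeautifulSoup(html, 'lxml')
    # one call covers both layouts
    print(soup.select_one('.wT3VGc, .XcVN5d').text)  # 589.29 both times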
import requests, lxml
from bs4 import BeautifulSoup

headers = {
    'User-agent':
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582'
}

stock_queries = [
    'netflix stock',
    'amazon stock',
    'alibaba stock',
    'cloudflare stock',
    'apple stock',
    'walmart stock'
]

def get_stock_answerbox():
    for query in stock_queries:
        params = {
            'q': query,
            'gl': 'us'
        }

        html = requests.get('https://www.google.com/search', headers=headers, params=params)
        soup = BeautifulSoup(html.text, 'lxml')
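        # the raw title text is a '>'-separated string containing non-breaking
        # spaces; drop the '\xa0' characters and keep the part after '>'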
        title = soup.select_one('.oPhL2e').text.replace(u'\xa0', u'').split('>')[1]
        link = soup.select_one('.tiS4rf')['href']
        date_time = soup.select_one('[jsname=ihIZgd]').text.replace(' ·', '')
        market_status = soup.select_one('.TgMHGc span:nth-child(1)').text.strip().replace(':', '')
        currency = soup.select_one('.knFDje').text.strip()
        # two selectors which will handle two layouts
        current_price = soup.select_one('.wT3VGc, .XcVN5d').text
        price_change = soup.select_one('.WlRRw > span:nth-child(1)').text
        price_change_percent = soup.select_one('.jBBUv span:nth-child(1)').text.replace('(', '').replace(')', '')
        price_change_date = soup.select_one('.jdUcZd span').text.strip().capitalize()

        price_movement = 'Down' if '−' in price_change else 'Up'

        # different exchange & stock ticker layout handling (US website has a different stock layout)
        if soup.select_one('.HfMth') is None:
            exchange = soup.select_one('.EFkvDd').text
            stock_ticker = soup.select_one('.WuDkNe').text
        else:
            stock_exchange_ticker = soup.select_one('.HfMth').text
            exchange = stock_exchange_ticker.split(': ')[0]
            stock_ticker = stock_exchange_ticker.split(': ')[1]

        print(f'\nTitle: {title}\n'
              f'Link: {link}\n'
              f'Stock status: {market_status}\n'
              f'Current time: {date_time}\n'
              f'Exchange: {exchange}\n'
              f'Stock ticker: {stock_ticker}\n'
              f'Currency: {currency}\n'
              f'Current price: {current_price}\n'
              f'Price change: {price_change}\n'
              f'Percent price change: {price_change_percent}\n'
              f'Price movement: {price_movement}\n'
              f'Price change date: {price_change_date}\n')

        for stock_table_key, stock_table_value in zip(soup.select('.JgXcPd'), soup.select('.iyjjgb')):
            stock_key = stock_table_key.text
            stock_value = stock_table_value.text
            print(f"{stock_key}: {stock_value}")

get_stock_answerbox()
-------------
'''
Title: Netflix Inc
Link: https://www.google.com/finance/quote/NFLX:NASDAQ?sa=X&ved=2ahUKEwj1npu7oP7yAhWEcc0KHT1uDiEQ3ecFegQIGxAS
Stock status: Closed
Current time: Sep 14, 5:42 AM EDT
Exchange: NASDAQ
Stock ticker: NFLX
Currency: USD
Current price: 589.29
Price change: −9.43
Percent price change: 1.58%
Price movement: Down
Price change date: Today
Open: 598.57
High: 598.57
Low: 582.78
Mkt cap: 260.82B
P/E ratio: 61.10
Div yield: -
Prev close: 598.72
52-wk high: 615.60
52-wk low: 458.60
Title: Amazon.com, Inc.
Link: https://www.google.com/finance/quote/AMZN:NASDAQ?sa=X&ved=2ahUKEwjeoOW7oP7yAhVRK80KHZoSCSEQ3ecFegQIKRAS
Stock status: Closed
Current time: Sep 13, 6:54 PM EDT
Exchange: NASDAQ
Stock ticker: AMZN
Currency: USD
Current price: 3,457.17
Price change: −11.98
Percent price change: 0.35%
Price movement: Down
Price change date: Today
Open: 3,482.80
High: 3,497.96
Low: 3,438.00
Mkt cap: 1.75T
P/E ratio: 60.25
Div yield: -
52-wk high: 3,773.08
52-wk low: 2,871.00
...
'''
Scrape Google Population Answer Box

import requests, lxml
from bs4 import BeautifulSoup

headers = {
    "User-agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}

params = {
    "q": "population of london",
}

def get_population_answerbox():
    html = requests.get('https://www.google.com/search', headers=headers, params=params)
    soup = BeautifulSoup(html.text, 'lxml')

    place = soup.select_one('.GzssTd span').text
    population_year = soup.select_one('.KBXm4e').text.split(' ')
    population = population_year[0]
    year = population_year[1].replace('(', '').replace(')', '')
    explore_more_link = soup.select_one('.tiS4rf')['href']
    sources = [source.text for source in soup.select('.kno-ftr span a')]

    print(f'{place}\nCurrent population: {population}\nCaptured in {year}\n{sources}\n{explore_more_link}\n')

    for other_city, other_population in zip(soup.select('.AleqXe'), soup.select('.kpd-lv')):
        other_place_city = other_city.text.strip()
        other_place_population = other_population.text
        print(f'{other_place_city}: {other_place_population}')

get_population_answerbox()
-----------
'''
London
Current population: 8.982 million
Captured in 2019
['Eurostat', 'United States Census Bureau', 'Feedback']
https://datacommons.org/place?utm_medium=explore&dcid=nuts/UKI&mprop=count&popt=Person&hl=en
London: 8.982 million
New York: 8.419 million
Scotland: 5.454 million
'''
Scrape Google Converter Answer Box

import requests, lxml
from bs4 import BeautifulSoup

headers = {
    "User-agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}

params = {
    "q": "100 usd in gbp",
    "gl": "us"
}

def get_converter_answerbox():
    html = requests.get('https://www.google.com/search', headers=headers, params=params)
    soup = BeautifulSoup(html.text, 'lxml')

    conversion = soup.select_one('.SwHCTb').text
    conversion_currency = soup.select_one('.MWvIVe').text
    print(f"{conversion} {conversion_currency}")

get_converter_answerbox()
# 72.16 Pound sterling
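The same two selectors cover other conversions as well; a small sketch looping over a few illustrative conversion queries (made-up queries, same selectors as above), guarding against queries that don't trigger the answer box:

import requests, lxml
from bs4 import BeautifulSoup

headers = {
    "User-agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}

# illustrative queries; any unit or currency conversion should work the same way
conversion_queries = ['100 usd in gbp', '100 eur in usd', '10 km in miles']

for query in conversion_queries:
    html = requests.get('https://www.google.com/search', headers=headers, params={'q': query, 'gl': 'us'})
    soup = BeautifulSoup(html.text, 'lxml')

    conversion = soup.select_one('.SwHCTb')
    conversion_currency = soup.select_one('.MWvIVe')

    # select_one() returns None when the converter answer box isn't present
    if conversion and conversion_currency:
        print(f'{query} -> {conversion.text} {conversion_currency.text}')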
Scrape Google Dictionary Answer Box

import requests, lxml, json
from bs4 import BeautifulSoup

headers = {
    'User-agent':
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582'
}

params = {
    'q': 'swagger definition',
    'gl': 'us'
}

def get_dictionary_answerbox():
    html = requests.get('https://www.google.com/search', headers=headers, params=params)
    soup = BeautifulSoup(html.text, 'lxml')

    data = []

    for result in soup.select('.VpH2eb.vmod'):
        syllables = result.select_one('.DgZBFd span').text
        audio_link = f"https:{result.select_one('.brWULd audio source')['src']}"
        phonetic = result.select_one('.S23sjd .LTKOO span').text
        word_types = [word_type.text for word_type in result.select('.vdBwhd .YrbPuc')]
        definitions = [definition.text for definition in result.select('.LTKOO div[data-dobid=dfn]')]
        sentence_examples = [example.text for example in result.select('.ubHt5c')]
        similar_words = [similar_word.text for similar_word in result.select('.p9F8Cd span')]

        data.append({
            'syllables': syllables,
            'audio_link': audio_link,
            'phonetic': phonetic,
            'examples': {
                'word_type': [word_types],
                'definitions': [definitions],
                'sentence_examples': [sentence_examples],
                'similar_words': [similar_words]
            }
        })

    print(json.dumps(data, indent=2, ensure_ascii=False))

get_dictionary_answerbox()
---------
'''
[
  {
    "syllables": "swag·ger",
    "audio_link": "https://ssl.gstatic.com/dictionary/static/sounds/20200429/swagger--_us_1.mp3",
    "phonetic": "ˈswaɡər",
    "examples": {
      "word_type": [
        [
          "verb",
          "noun",
          "adjective"
        ]
      ],
      "definitions": [
        [
          "walk or behave in a very confident and typically arrogant or aggressive way.",
          "a very confident and typically arrogant or aggressive gait or manner.",
          "denoting a coat or jacket cut with a loose flare from the shoulders."
        ]
      ],
      "sentence_examples": [
        [
          "\"he swaggered along the corridor\"",
          "\"they strolled around the camp with an exaggerated swagger\""
        ]
      ],
      "similar_words": [
        [
          "strut",
          "parade",
          "stride",
          "roll",
          "prance",
          "sashay",
          "swash",
          "boast",
          "brag",
          "bray",
          "bluster",
          "crow",
          "gloat",
          "posture",
          "pose",
          "blow one's own trumpet",
          "show off",
          "swank",
          "play to the gallery",
          "rodomontade",
          "strut",
          "parading",
          "roll",
          "prancing",
          "confidence",
          "arrogance",
          "self-assurance",
          "show",
          "ostentation",
          "boasting",
          "bragging",
          "bluster",
          "swashbuckling",
          "vainglory",
          "puffery",
          "swank",
          "braggadocio",
          "rodomontade",
          "gasconade"
        ]
      ]
    }
  }
]
'''
Scrape Google First Organic Result Answer Box


import requests, lxml
from bs4 import BeautifulSoup

headers = {
    'User-agent':
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582'
}

params = {
    'q': 'luke skywalker lightsaber color',
    'gl': 'us'
}

def get_organic_result_answerbox():
    html = requests.get('https://www.google.com/search', headers=headers, params=params)
    soup = BeautifulSoup(html.text, 'lxml')

    answer = soup.select_one('.XcVN5d').text
    title = soup.select_one('.DKV0Md').text
    link = soup.select_one('.yuRUbf a')['href']
    snippet = soup.select_one('.hgKElc').text
    print(f"{answer}\n{title}\n{link}\n{snippet}")

get_organic_result_answerbox()
----------
'''
Green
Luke Skywalker's Lightsaber (Green Blade) | StarWars.com
https://www.starwars.com/databank/luke-skywalkers-lightsaber
After losing his father's lightsaber on Cloud City, Luke Skywalker constructed a replacement with a green plasma blade, its hilt similar to that of Obi-Wan Kenobi's weapon.
'''
Scrape Google Second Organic Result Answer Box


import requests, lxml
from bs4 import BeautifulSoup

headers = {
    'User-agent':
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582'
}

params = {
    'q': 'Who is using IBM',
    'gl': 'us'
}

def get_second_organic_result_answerbox():
    html = requests.get('https://www.google.com/search', headers=headers, params=params)
    soup = BeautifulSoup(html.text, 'lxml')

    title = soup.select_one('.xpdopen .DKV0Md').text
    link = soup.select_one('.xpdopen .yuRUbf a')['href']
    displayed_link = soup.select_one('.xpdopen .iUh30').text

    # snippet layout handling + bullet points
    # example: shorturl.at/kIJQ9
    if soup.select_one('.xpdopen .co8aDb b') and soup.select_one('.TrT0Xe') is not None:
        snippet = soup.select_one('.xpdopen .co8aDb b').text
        bullet_points = '\n'.join([bullet_point.text for bullet_point in soup.select('.TrT0Xe')])
    else:
        snippet = soup.select_one('.xpdopen .iKJnec').text
        bullet_points = None

    print(f'{title}\n{link}\n{displayed_link}\n{snippet}\n\nBullet points:\n{bullet_points}')

    if soup.select_one('#rso td:nth-child(1)') is None:
        pass
    else:
        print('\nTable:')
        for table_key, table_value in zip(
                soup.select('#rso td:nth-child(1)'),
                soup.select('#rso td+ td')):
            key = table_key.text
            value = table_value.text
            print(f'{key}: {value}')

get_second_organic_result_answerbox()
----------
'''
Companies using IBM Watson and its marketshare - Enlyft
https://enlyft.com/tech/products/ibm-watson
https://enlyft.com › All Products › Machine Learning
The companies using IBM Watson are most often found in United States and in the Computer Software industry. IBM Watson is most often used by companies with 10-50 employees and 1M-10M dollars in revenue....Who uses IBM Watson?
Website: redcross.org
Country: United States
Revenue: >1000M
Company Size: >10000
# second layout
Types of Content Writing - DemandJump
https://www.demandjump.com/blog/types-of-content-writing
https://www.demandjump.com › blog › types-of-conten...
What are the types of content writing?
Bullet points:
Blogging. Creating blog posts is a staple of content writing. ...
Copywriting. ...
Technical Writing/Long Form. ...
Social Media Posts. ...
Emails.
SEO Content Writer. An SEO writer is one of the most important content writer types out there. ...
Blog Content Writer. ...
Long-Form Content Writer. ...
Social Media Content Writer. ...
Copywriter. ...
Email Marketing Writer. ...
Technical Writers. ...
Press Release Writer.
'''
Scrape Google Third Organic Result Answer Box

import requests, lxml
from bs4 import BeautifulSoup

headers = {
    'User-agent':
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582'
}

params = {
    'q': 'How much is a large pizza at Papa Johns',
    'gl': 'us'
}

def get_third_organic_result_answerbox():
    html = requests.get('https://www.google.com/search', headers=headers, params=params)
    soup = BeautifulSoup(html.text, 'lxml')

    title = soup.select_one('.ifM9O .LC20lb').text
    link = soup.select_one('.ifM9O .yuRUbf a')['href']
    displayed_link = soup.select_one('.ifM9O .iUh30').text
    snippet = soup.select_one('.ifM9O .iKJnec').text
    print(title, link, displayed_link, snippet, sep='\n')

    for table_key, table_value, table_value_price in zip(
            soup.select('.ztXv9~ tr+ tr td:nth-child(1)'),
            soup.select('td:nth-child(2)'),
            soup.select('td~ td+ td')):
        key = table_key.text
        value = table_value.text
        price = table_value_price.text
        print(f"{key}: {value}, {price}")

get_third_organic_result_answerbox()
--------------
'''
Papa John's - Fast Food Menu Prices
https://www.fastfoodmenuprices.com/papa-johns-prices/
https://www.fastfoodmenuprices.com › papa-johns-prices
Papa John's Menu Prices
Cheese (Original): Medium, $13.00
Cheese (Original): Large, $15.00
Cheese (Original): Extra Large, $17.00
'''
Scrape Google Translation Answer Box

import requests, lxml
from bs4 import BeautifulSoup

headers = {
    'User-agent':
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582'
}

params = {
    'q': 'hello in french',
    'gl': 'us'
}

def get_translation_answerbox():
    html = requests.get('https://www.google.com/search', headers=headers, params=params)
    soup = BeautifulSoup(html.text, 'lxml')

    detected_language = soup.select_one('.source-language').text
    text = soup.select_one('#tw-source-text-ta').text
    pronounce = soup.select_one('#tw-source-rmn .Y2IQFc').text
    target_language = soup.select_one('.target-language').text
    target_text = soup.select_one('#tw-target-text .Y2IQFc').text
    print(detected_language, text, pronounce, target_language, target_text, sep='\n')

    for result in soup.select('.DFDLnc'):
        title = result.next_sibling.select_one('.hrcAhc').text
        words = result.next_sibling.select_one('.MaH2Hf').text
        print(title, words, sep='\n')

get_translation_answerbox()
------------
'''
English - detected
hello
həˈlō
French
Bonjour
Bonjour!
Hello!, Hi!, Good morning!, Good afternoon!, How do you do?, Hallo!
Salut!
Hi!, Hello!, Salute!, All the best!, Hallo!, Hullo!
Tiens!
Hallo!, Hello!, Hullo!, Why!
Allô!
Hello!, Hullo!, Hallo!
'''
Scrape Google Formula Answer Box
I found that there are currently two formula layouts: this one and the one below. The first one is straightforward, while the second one is a bit trickier.

import requests, lxml, numpy, time
from bs4 import BeautifulSoup
from selenium import webdriver

headers = {
    'User-agent':
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582'
}

params = {
    'q': 'formula of distance',
    'gl': 'us'
}

# example query: shorturl.at/euPX1
def get_formula_answerbox():
    html = requests.get('https://www.google.com/search', headers=headers, params=params)
    soup = BeautifulSoup(html.text, 'lxml')

    formula_img = soup.select_one('.IGEbUc')['src'].strip()
    formula = soup.select_one('.IGEbUc')['alt']
    print(formula, formula_img, sep='\n')

    temp_list = []
    clear_list = []

    for symbol, definition in zip(
            soup.select('img.ZFAGbf'),
            soup.select('.GQLwEe')):
        '''
        appending to temporary list:
        1. formula symbol name
        2. formula symbol (plain text)
        3. formula symbol image (svg)
        '''
        temp_list.append([definition.text, symbol['alt'], symbol['src']])

    # https://stackoverflow.com/a/45323085/15164646
    # multidimensional list -> flat list
    for item in list(numpy.array(temp_list).flat):
        if item not in clear_list:
            clear_list.append(item)

    for result in clear_list:
        print(result)

get_formula_answerbox()
--------------
'''
d = \sqrt{(x_2 - x_1)^2 + (y_2-y_1)^2}
https://www.gstatic.com/education/formulas2/397133473/en/distance_formula.svg
distance
d
https://www.gstatic.com/education/formulas2/397133473/en/distance_formula_d.svg
coordinates of the first point
(x_1, y_1)
https://www.gstatic.com/education/formulas2/397133473/en/distance_formula_x1y1.svg
coordinates of the second point
(x_2, y_2)
https://www.gstatic.com/education/formulas2/397133473/en/distance_formula_x2y2.svg
'''
The second layout is different because in such a layout there's no <img> element to grab the formula from (screenshot below); instead, the formula is plain text that doesn't make much sense on its own.
To make it work, we can grab the plain-text formula and take a screenshot of the rendered formula with selenium, and then convert the screenshot to LaTeX code with mathpix.
Note: using selenium will slow everything down quite a bit.

# example URL: shorturl.at/bfvAU
def get_formula_solve_answerbox():
    html = requests.get('https://www.google.com/search', headers=headers, params=params)
    soup = BeautifulSoup(html.text, 'lxml')

    if soup.select_one('#HE4HM') is not None:
        title = soup.select_one('.vk_h').text
        solving = soup.select_one('.goog-menu-button-caption').text
        print(title, solving, sep='\n')

        for i in soup.select('.lr-fy-ol.Gi02q'):
            print(i.text)
    else:
        print('no formula found')

    # ----------------------------
    # Another way using Selenium
    driver = webdriver.Chrome(executable_path='path/to/chromedriver.exe')
    # dashes instead of "/" and ":" so the screenshot filename stays valid
    CURRENT_DATE = time.strftime("%d-%m-%y")

    queries = [
        'volume of a cylinder',
        'volume of a sphere',
        'volume of a cone',
        'volume of a cube',
        'volume of a pyramid',
        'sphere surface area',
        'cube surface area'
    ]

    for query in queries:
        current_time = time.strftime("%H-%M-%S")
        print(current_time)

        driver.get(f"https://www.google.com/search?q={query}&gl=us")
        plain_text_formula = driver.find_element_by_css_selector('.lr-fy-ol.Gi02q').text.replace('\n', '')
        print(plain_text_formula)
        driver.find_element_by_xpath('//*[@id="HE4HM"]/div/div[3]/div[2]').screenshot(f'formula_{CURRENT_DATE}_{current_time}.png')
        time.sleep(1.3)

    driver.quit()

get_formula_solve_answerbox()
----------
'''
# Bs4 output
Vπr2h
Va3
V43πr3
# Selenium output
V=πr2h
V=43πr3
V=πr2h3
V=a3
V=lwh3
A=4πr2
'''
Selenium screenshots:

And a few examples of the actual screenshots:



To convert the images with mathpix, we need to process the screenshots and turn them into LaTeX code.
1. Create the authorization.
2. Make a POST request and send the screenshot that was taken.
import requests
import json

r = requests.post("https://api.mathpix.com/v3/text",
    files={"file": open("YOUR_IMAGE_FILE.png", "rb")},
    data={
        "options_json": json.dumps({
            "math_inline_delimiters": ["$", "$"],
            "rm_spaces": True
        })
    },
    headers={
        "app_id": "APP_ID",
        "app_key": "APP_KEY"
    }
)

print(json.dumps(r.json(), indent=4, sort_keys=True))
The response:
{
    "auto_rotate_confidence": 0.0046418966250847404,
    "auto_rotate_degrees": 0,
    "confidence": 0.9849104881286621,
    "confidence_rate": 0.9849104881286621,
    "is_handwritten": true,
    "is_printed": false,
    "latex_styled": "f(x)=\\left\\{\\begin{array}{ll}\nx^{2} & \\text { if } x<0 \\\\\n2 x & \\text { if } x \\geq 0\n\\end{array}\\right.",
    "request_id": "bd02af63ef187492c085331c60151d98",
    "text": "$f(x)=\\left\\{\\begin{array}{ll}x^{2} & \\text { if } x<0 \\\\ 2 x & \\text { if } x \\geq 0\\end{array}\\right.$"
}
...
Check out the other available request parameters and format descriptions, as well as their API pricing.
Note: mathpix may well be enough for your task, but if it isn't, use Wolfram Alpha instead; that's their area of expertise.
Scrape Google Flights Answer Box

import requests, lxml
from bs4 import BeautifulSoup

headers = {
    'User-agent':
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582'
}

params = {
    'q': 'flight from london to new york',
    'gl': 'us'
}

def get_flights_answerbox():
    html = requests.get('https://www.google.com/search', headers=headers, params=params)
    soup = BeautifulSoup(html.text, 'lxml')

    title = soup.select_one('.N00E5e .mfMhoc').text.strip()
    print(f"{title}\n")

    for flight in soup.select('.ikUyY'):
        link = flight['href']
        airline = flight.select_one('.ps0VMc').text.strip()
        flight_duration = flight.select_one('.sRcB8').text.strip()
        flight_option = flight.select_one('.u85UCd').text.strip()
        price = flight.select_one('.xqqLDd').text.strip()
        print(f'{airline}\n{flight_duration}\n{flight_option}\n{price}\n{link}\n')

get_flights_answerbox()
-----------
'''
Flights from London, United Kingdom (all airports) to New York, NY (all airports)
Tap Air Portugal
12h 15m+
Connecting
from $327
https://www.google.com/flights?gl=us&source=flun&uitype=cuAA&hl=en&curr=USD&tfs=CAEQAhotEgoyMDIxLTExLTAxMgJUUGoMCAISCC9tLzA0anBscg0IAhIJL20vMDJfMjg2Gi0SCjIwMjEtMTEtMDgyAlRQag0IAhIJL20vMDJfMjg2cgwIAhIIL20vMDRqcGx6aENqUklXbkpCZDA1WmRFeENRbTlCUmpnMVRIZENSeTB0TFMwdExTMHRMVzkxYldjeE9FRkJRVUZCUjBaQ2NtaEZSMDU1VDBGQkVnTmpWRkFhQ3dpOC93RVFBaG9EVlZORU9BTnd2UDhC&sa=X&ved=2ahUKEwjXrLmSx4DzAhWQGs0KHZZVAQYQ1RUoAHoECBcQGg
JetBlue
8h 48m
Nonstop
from $402
https://www.google.com/flights?gl=us&source=flun&uitype=cuAA&hl=en&curr=USD&tfs=CAEQAhotEgoyMDIxLTExLTAxMgJCNmoMCAISCC9tLzA0anBscg0IAhIJL20vMDJfMjg2Gi0SCjIwMjEtMTEtMDgyAkI2ag0IAhIJL20vMDJfMjg2cgwIAhIIL20vMDRqcGx6aENqUklXbkpCZDA1WmRFeENRbTlCUmpnMVRIZENSeTB0TFMwdExTMHRMVzkxYldjeE9FRkJRVUZCUjBaQ2NtaEZSMDU1VDBGQkVnTnVRallhQ3dpSXVnSVFBaG9EVlZORU9BTndpTG9D&sa=X&ved=2ahUKEwjXrLmSx4DzAhWQGs0KHZZVAQYQ1RUoAXoECBcQGw
...
'''
Scrape Google Matches Answer Box

import requests, lxml
from bs4 import BeautifulSoup

headers = {
    'User-agent':
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582'
}

params = {
    'q': 'nfl matches',
    'gl': 'us'
}

'''
Works properly while each team has scores, for example:
Chiefs: 29
Browns: 29

But when only a date and time are shown it won't scrape them, for example:
Today
20:10

It would only scrape the teams which HAVE scores, but the scores attached to the teams won't be correct.
'''

def get_sport_matches_answerbox():
    html = requests.get('https://www.google.com/search', headers=headers, params=params)
    soup = BeautifulSoup(html.text, 'lxml')

    title = soup.select_one('.ofy7ae').text

    if soup.select_one('.mKwiob'):
        league = soup.select_one('.mKwiob').text
    else:
        league = 'Not mentioned'

    print(title, league, sep='\n')
    print()

    # zip() will get data in parallel but it will find all needed elements at once
    for first_team, second_team, first_score, second_score, status, match_date, video_highlight in zip(
            soup.select('.L5Kkcd+ .L5Kkcd span'),
            soup.select('.L5Kkcd:nth-child(5) span'),
            soup.select('.L5Kkcd:nth-child(5) .imspo_mt__t-sc .imspo_mt__tt-w'),
            soup.select('.imspo_mt__lt-t .imspo_mt__tt-w'),
            soup.select('.imspo_mt__match-status'),
            soup.select('.imspo_mt__ms-w div div :nth-child(1)'),
            soup.select('.BbrjBe')):
        match_status = status.text
        match_game_date = match_date.text
        match_video_highlight = video_highlight.select_one('a')['href']
        print(match_status, match_game_date, match_video_highlight, sep='\n')
        print(f"{first_team.text}: {first_score.text}")
        print(f"{second_team.text}: {second_score.text}")
        print()

get_sport_matches_answerbox()
----------
'''
NFL
Week 1 of 18
Final
Sun, Sep 12
https://www.youtube.com/watch?v=4g2R-kn_-0c&feature=onebox
Chiefs: 29
Browns: 29
Final
Sun, Sep 12
https://www.youtube.com/watch?v=dc_e6GdLzls&feature=onebox
Patriots: 17
Dolphins: 16
...
'''
Scrape Google Address Answer Box

import requests, lxml
from bs4 import BeautifulSoup

headers = {
    "User-agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}

params = {
    "q": "spotlight 29 casino address",
}

def get_address_answerbox():
    html = requests.get('https://www.google.com/search', headers=headers, params=params)
    soup = BeautifulSoup(html.text, 'lxml')

    address = soup.select_one('.sXLaOe').text
    print(address)

get_address_answerbox()
# 46-200 Harrison Pl, Coachella, CA 92236
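Not every query triggers an address answer box, so it can be worth guarding the selector; a minimal variation of the function above (same selector, same headers and params, just with a None check):

def get_address_answerbox_safe():
    html = requests.get('https://www.google.com/search', headers=headers, params=params)
    soup = BeautifulSoup(html.text, 'lxml')

    address = soup.select_one('.sXLaOe')
    # select_one() returns None when no address answer box is present
    print(address.text if address else 'No address answer box found')

get_address_answerbox_safe()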
Scrape Google Answer Boxes using SerpApi
SerpApi is a paid API with a free plan.
The difference is that everything done above, and more, is already there by default, and there's no need to maintain the parser over time if something in the HTML layout changes and breaks it.
Instead, the only thing that really needs to be done is to iterate over the structured JSON and quickly grab the data you want.
Example code for all of the mentioned layouts, except the second formula layout:
from serpapi import GoogleSearch
import os, json

def serpapi_answerbox_example():
    params = {
        "api_key": os.getenv("API_KEY"),        # API key environment variable
        "engine": "google",                     # search engine
        "q": "flight from london to new york",  # query
        "location": "United States",            # from where search to originate
        "google_domain": "google.com",          # domain name
        "gl": "us",                             # location to search from
        "hl": "en"                              # language
    }

    search = GoogleSearch(params)
    results = search.get_dict()

    print(json.dumps(results['answer_box'], indent=2, ensure_ascii=False))

serpapi_answerbox_example()
-----------
'''
{
  "type": "google_flights",
  "title": "Flights from London, United Kingdom (all airports) to New York, NY (all airports)",
  "flights": [
    {
      "link": "https://www.google.com/flights?gl=us&hl=en&source=flun&uitype=cuAA&curr=USD&tfs=CAEQAhotEgoyMDIxLTExLTAxMgJUUGoMCAISCC9tLzA0anBscg0IAhIJL20vMDJfMjg2Gi0SCjIwMjEtMTEtMDgyAlRQag0IAhIJL20vMDJfMjg2cgwIAhIIL20vMDRqcGx6aENqUklZVzVMZWtNM1YzVXhhVFJCUmpObGJHZENSeTB0TFMwdExTMHRMUzF2ZFhSa04wRkJRVUZCUjBaRE0wOWpURzFVYjBGQkVnTmpWRkFhQ3dpOC93RVFBaG9EVlZORU9BTnd2UDhC&sa=X&ved=2ahUKEwir8vL554LzAhUVnWoFHbhSAnAQ1RUoAHoECBYQGg",
      "flight_info": [
        "Tap Air Portugal",
        "12h 15m+",
        "Connecting",
        "from $327"
      ]
    },
    {
      "link": "https://www.google.com/flights?gl=us&hl=en&source=flun&uitype=cuAA&curr=USD&tfs=CAEQAhotEgoyMDIxLTExLTAxMgJCNmoMCAISCC9tLzA0anBscg0IAhIJL20vMDJfMjg2Gi0SCjIwMjEtMTEtMDgyAkI2ag0IAhIJL20vMDJfMjg2cgwIAhIIL20vMDRqcGx6aENqUklZVzVMZWtNM1YzVXhhVFJCUmpObGJHZENSeTB0TFMwdExTMHRMUzF2ZFhSa04wRkJRVUZCUjBaRE0wOWpURzFVYjBGQkVnTnVRallhQ3dpSXVnSVFBaG9EVlZORU9BTndpTG9D&sa=X&ved=2ahUKEwir8vL554LzAhUVnWoFHbhSAnAQ1RUoAXoECBYQGw",
      "flight_info": [
        "JetBlue",
        "8h 48m",
        "Nonstop",
        "from $402"
      ]
    }
  ]
}
'''
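From there, grabbing a particular field inside serpapi_answerbox_example() is plain dict and list access; for instance, a small sketch iterating over the flights shown in the output above (keys taken from that JSON):

    # inside serpapi_answerbox_example(), after results = search.get_dict():
    for flight in results['answer_box']['flights']:
        airline, duration, stops, price = flight['flight_info']
        print(airline, duration, stops, price, flight['link'], sep='\n')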
Links
Code in the online IDE • Google Direct Answer Box API • Mathpix • Wolfram Alpha
Outro
If you have any questions, something isn't working correctly, or you have any suggestions, feel free to drop a comment in the comment section or reach out via Twitter at @serp_api. If you find a bug while using SerpApi, please report it on the SerpApi Forum.
Yours,
Dimitry, and the rest of the SerpApi Team.