Scrape all Google Scholar Profile, Author Results to CSV with Python and SerpApi
🔨 What is required: understanding of loops, data structures, and exception handling; the serpapi, pandas, and urllib libraries.
⏱️ How long will it take: ~15-30 minutes to read and implement.
- What will be scraped
- Prerequisites
- Process
  * Profile results
  * Author results
  * All author articles results
  * Save to CSV
- Full Code
- Links
What will be scraped

Prerequisites
Separate virtual environment
In short, it creates a new independent set of installed libraries, including different Python versions, that can coexist with each other in the same system, preventing library or Python version conflicts.
If you haven't worked with a virtual environment before, have a look at my dedicated [Python virtual environments tutorial using Virtualenv and Poetry](https://serpapi.com/blog/python-virtual-environments-using-virtualenv-and-poetry/) blog post to get familiar.
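If you just want the commands, a minimal setup with Python's built-in venv module looks like this (env is an arbitrary environment name):

python -m venv env
source env/bin/activate  # on Windows: env\Scripts\activate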
Install libraries:
pip install pandas google-search-results
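All of the code below reads your SerpApi API key from the API_KEY environment variable via os.getenv("API_KEY"), so set it before running, for example:

export API_KEY=<your_serpapi_api_key>  # on Windows: set API_KEY=<your_serpapi_api_key>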
Process

If you don't need an explanation:
- jump to the Full Code section,
- grab the full code from the GitHub repository,
- try it in the online IDE.

Scrape all Google Scholar Profile Results
import os
from serpapi import GoogleSearch
from urllib.parse import urlsplit, parse_qsl
import pandas as pd


def profile_results():
    print("Extracting profile results..")

    params = {
        "api_key": os.getenv("API_KEY"),      # SerpApi API key
        "engine": "google_scholar_profiles",  # profile results search engine
        "mauthors": "blizzard",               # search query
    }
    search = GoogleSearch(params)

    profile_results_data = []

    profiles_is_present = True
    while profiles_is_present:
        profile_results = search.get_dict()

        for profile in profile_results.get("profiles", []):
            print(f'Currently extracting {profile.get("name")} with {profile.get("author_id")} ID.')

            thumbnail = profile.get("thumbnail")
            name = profile.get("name")
            link = profile.get("link")
            author_id = profile.get("author_id")
            affiliations = profile.get("affiliations")
            email = profile.get("email")
            cited_by = profile.get("cited_by")
            interests = profile.get("interests")

            profile_results_data.append({
                "thumbnail": thumbnail,
                "name": name,
                "link": link,
                "author_id": author_id,
                "email": email,
                "affiliations": affiliations,
                "cited_by": cited_by,
                "interests": interests
            })

        # paginate: update search params with the query string of the next page URL
        if "next" in profile_results.get("pagination", {}):
            search.params_dict.update(dict(parse_qsl(urlsplit(profile_results.get("pagination").get("next")).query)))
        else:
            profiles_is_present = False

    return profile_results_data
Scrape all profile results explanation
Import libraries:
import os
from serpapi import GoogleSearch
from urllib.parse import urlsplit, parse_qsl
import pandas as pd
Pass search parameters to SerpApi and create a temporary list() to store the extracted data:
params = {
    "api_key": os.getenv("API_KEY"),      # SerpApi API key
    "engine": "google_scholar_profiles",  # profile results search engine
    "mauthors": "blizzard",               # search query
}
search = GoogleSearch(params)

profile_results_data = []
Set up a while loop, and add an if statement to exit it when there are no pages left:
profiles_is_present = True
while profiles_is_present:
    profile_results = search.get_dict()

    # for loop extraction here..

    # if next page is present in SerpApi pagination -> update params to the new page results.
    # if no next page -> exit the while loop.
    if "next" in profile_results.get("pagination", {}):
        search.params_dict.update(dict(parse_qsl(urlsplit(profile_results.get("pagination").get("next")).query)))
    else:
        profiles_is_present = False
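To make the pagination trick concrete, here is a tiny standalone sketch of what urlsplit() + parse_qsl() do to a next page URL (the URL below is a made-up example, not a real SerpApi response):

from urllib.parse import urlsplit, parse_qsl

# hypothetical "next" URL, for illustration only
next_url = "https://serpapi.com/search.json?engine=google_scholar_profiles&mauthors=blizzard&after_author=abc123"

# the query string is parsed into key-value pairs and turned into a dict,
# which is then merged into the existing search parameters
print(dict(parse_qsl(urlsplit(next_url).query)))
# {'engine': 'google_scholar_profiles', 'mauthors': 'blizzard', 'after_author': 'abc123'}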
Iterate over profile results in a for loop:
for profile in profile_results.get("profiles", []):
    print(f'Currently extracting {profile.get("name")} with {profile.get("author_id")} ID.')

    thumbnail = profile.get("thumbnail")
    name = profile.get("name")
    link = profile.get("link")
    author_id = profile.get("author_id")
    affiliations = profile.get("affiliations")
    email = profile.get("email")
    cited_by = profile.get("cited_by")
    interests = profile.get("interests")
Append the extracted data to the temporary list as a dictionary, and return it:
profile_results_data.append({
    "thumbnail": thumbnail,
    "name": name,
    "link": link,
    "author_id": author_id,
    "email": email,
    "affiliations": affiliations,
    "cited_by": cited_by,
    "interests": interests
})

return profile_results_data
# example output:
'''
Extracting profile results..
Currently extracting Adam Lobel with _xwYD2sAAAAJ ID.
... other profiles

[
  {
    "thumbnail": "https://scholar.googleusercontent.com/citations?view_op=small_photo&user=_xwYD2sAAAAJ&citpid=3",
    "name": "Adam Lobel",
    "link": "https://scholar.google.com/citations?hl=en&user=_xwYD2sAAAAJ",
    "author_id": "_xwYD2sAAAAJ",
    "email": "Verified email at AdamLobel.com",
    "affiliations": "Blizzard Entertainment",
    "cited_by": 2935,
    "interests": [
      {
        "title": "Gaming",
        "serpapi_link": "https://serpapi.com/search.json?engine=google_scholar_profiles&hl=en&mauthors=label%3Agaming",
        "link": "https://scholar.google.com/citations?hl=en&view_op=search_authors&mauthors=label:gaming"
      },
      {
        "title": "Emotion regulation",
        "serpapi_link": "https://serpapi.com/search.json?engine=google_scholar_profiles&hl=en&mauthors=label%3Aemotion_regulation",
        "link": "https://scholar.google.com/citations?hl=en&view_op=search_authors&mauthors=label:emotion_regulation"
      }
    ]
  },
  ... other profiles
]
'''
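As a quick sanity check, you can call the function directly and inspect the first item; a minimal sketch (the __main__ guard is an addition, not part of the original script):

if __name__ == "__main__":
    data = profile_results()
    print(f"Extracted {len(data)} profiles. First profile:")
    print(data[0] if data else "no profiles found")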

Scrape Google Scholar Author Results
import os
from serpapi import GoogleSearch
from google_scholar_profile_results import profile_results
from urllib.parse import urlsplit, parse_qsl
import pandas as pd


def author_results():
    print("extracting author results..")

    author_results_data = []

    for author_id in profile_results():
        print(f"Parsing {author_id['author_id']} author ID.")

        params = {
            "api_key": os.getenv("API_KEY"),      # SerpApi API key
            "engine": "google_scholar_author",    # author results search engine
            "author_id": author_id["author_id"],  # search query
            "hl": "en"
        }
        search = GoogleSearch(params)
        results = search.get_dict()

        thumbnail = results.get("author", {}).get("thumbnail")
        name = results.get("author", {}).get("name")
        affiliations = results.get("author", {}).get("affiliations")
        email = results.get("author", {}).get("email")
        website = results.get("author", {}).get("website")
        interests = results.get("author", {}).get("interests")

        cited_by_table = results.get("cited_by", {}).get("table")
        cited_by_graph = results.get("cited_by", {}).get("graph")

        public_access_link = results.get("public_access", {}).get("link")
        available_public_access = results.get("public_access", {}).get("available")
        not_available_public_access = results.get("public_access", {}).get("not_available")

        co_authors = results.get("co_authors")

        author_results_data.append({
            "thumbnail": thumbnail,
            "name": name,
            "affiliations": affiliations,
            "email": email,
            "website": website,
            "interests": interests,
            "cited_by_table": cited_by_table,
            "cited_by_graph": cited_by_graph,
            "public_access_link": public_access_link,
            "available_public_access": available_public_access,
            "not_available_public_access": not_available_public_access,
            "co_authors": co_authors
        })

    return author_results_data
Scrape author results explanation
Import libraries and the previously defined profile_results() function:
import os
from serpapi import GoogleSearch
from google_scholar_profile_results import profile_results
from urllib.parse import urlsplit, parse_qsl
import pandas as pd
profile_results() paginates through all available pages and returns a list of dictionaries with author IDs such as _xwYD2sAAAAJ.
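For illustration, extracting just the IDs from that return value is a one-liner:

author_ids = [profile["author_id"] for profile in profile_results()]
print(author_ids)  # e.g. ['_xwYD2sAAAAJ', ...]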
Create a temporary list to store the extracted data:
author_results_data = []
Iterate over the extracted profiles, passing each author_id to the author_id search parameter:
for author_id in profile_results():
    print(f"Parsing {author_id['author_id']} author ID.")

    params = {
        "api_key": os.getenv("API_KEY"),      # SerpApi API key
        "engine": "google_scholar_author",    # author results search engine
        "author_id": author_id["author_id"],  # search query: _xwYD2sAAAAJ
        "hl": "en"
    }
    search = GoogleSearch(params)
    results = search.get_dict()
Extract the data:
thumbnail = results.get("author", {}).get("thumbnail")
name = results.get("author", {}).get("name")
affiliations = results.get("author", {}).get("affiliations")
email = results.get("author", {}).get("email")
website = results.get("author", {}).get("website")
interests = results.get("author", {}).get("interests")

cited_by_table = results.get("cited_by", {}).get("table")
cited_by_graph = results.get("cited_by", {}).get("graph")

public_access_link = results.get("public_access", {}).get("link")
available_public_access = results.get("public_access", {}).get("available")
not_available_public_access = results.get("public_access", {}).get("not_available")

co_authors = results.get("co_authors")
Append the extracted data to the temporary list as a dictionary, and return it:
author_results_data.append({
    "thumbnail": thumbnail,
    "name": name,
    "affiliations": affiliations,
    "email": email,
    "website": website,
    "interests": interests,
    "cited_by_table": cited_by_table,
    "cited_by_graph": cited_by_graph,
    "public_access_link": public_access_link,
    "available_public_access": available_public_access,
    "not_available_public_access": not_available_public_access,
    "co_authors": co_authors
})

return author_results_data
# example output:
'''
extracting author results..
Extracting profile results..
Currently extracting Adam Lobel with _xwYD2sAAAAJ ID.
... other authors
Parsing _xwYD2sAAAAJ author ID.
... other authors

[
  {
    "thumbnail": "https://scholar.googleusercontent.com/citations?view_op=view_photo&user=_xwYD2sAAAAJ&citpid=3",
    "name": "Adam Lobel",
    "affiliations": "Blizzard Entertainment",
    "email": "Verified email at AdamLobel.com",
    "website": "https://twitter.com/GrowingUpGaming",
    "interests": [
      {
        "title": "Gaming",
        "link": "https://scholar.google.com/citations?view_op=search_authors&hl=en&mauthors=label:gaming",
        "serpapi_link": "https://serpapi.com/search.json?engine=google_scholar_profiles&hl=en&mauthors=label%3Agaming"
      },
      {
        "title": "Emotion regulation",
        "link": "https://scholar.google.com/citations?view_op=search_authors&hl=en&mauthors=label:emotion_regulation",
        "serpapi_link": "https://serpapi.com/search.json?engine=google_scholar_profiles&hl=en&mauthors=label%3Aemotion_regulation"
      }
    ],
    "cited_by_table": [
      {
        "citations": {
          "all": 2935,
          "since_2017": 2348
        }
      },
      {
        "h_index": {
          "all": 10,
          "since_2017": 10
        }
      },
      {
        "i10_index": {
          "all": 11,
          "since_2017": 10
        }
      }
    ],
    "cited_by_graph": [
      {
        "year": 2014,
        "citations": 70
      },
      {
        "year": 2015,
        "citations": 188
      },
      {
        "year": 2016,
        "citations": 243
      },
      {
        "year": 2017,
        "citations": 342
      },
      {
        "year": 2018,
        "citations": 420
      },
      {
        "year": 2019,
        "citations": 553
      },
      {
        "year": 2020,
        "citations": 507
      },
      {
        "year": 2021,
        "citations": 504
      },
      {
        "year": 2022,
        "citations": 16
      }
    ],
    "public_access_link": "https://scholar.google.com/citations?view_op=list_mandates&hl=en&user=_xwYD2sAAAAJ",
    "available_public_access": 1,
    "not_available_public_access": 0,
    "co_authors": [
      {
        "name": "Isabela Granic",
        "link": "https://scholar.google.com/citations?user=4T5cjVIAAAAJ&hl=en",
        "serpapi_link": "https://serpapi.com/search.json?author_id=4T5cjVIAAAAJ&engine=google_scholar_author&hl=en",
        "author_id": "4T5cjVIAAAAJ",
        "affiliations": "Radboud University Nijmegen",
        "email": "Verified email at pwo.ru.nl",
        "thumbnail": "https://scholar.googleusercontent.com/citations?view_op=small_photo&user=4T5cjVIAAAAJ&citpid=4"
      },
      ... other co-authors
    ]
  },
  ... other authors
]
'''
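As a small illustration of working with this nested output, here's a sketch that prints each author's total citation count from cited_by_table (assuming author_results() from above):

# cited_by_table is a list of single-key dicts: citations, h_index, i10_index
for author in author_results():
    for row in author.get("cited_by_table") or []:
        if "citations" in row:
            print(f'{author["name"]}: {row["citations"]["all"]} total citations')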
Scrape all Author Articles from Google Scholar
import os
from serpapi import GoogleSearch
from google_scholar_profile_results import profile_results
from urllib.parse import urlsplit, parse_qsl
import pandas as pd


def all_author_articles():
    author_article_results_data = []

    for index, author_id in enumerate(profile_results(), start=1):
        print(f"Parsing {index} author with {author_id['author_id']} author ID.")

        params = {
            "api_key": os.getenv("API_KEY"),     # SerpApi API key
            "engine": "google_scholar_author",   # author results search engine
            "hl": "en",                          # language
            "sort": "pubdate",                   # sort by year
            "author_id": author_id["author_id"]  # search query
        }
        search = GoogleSearch(params)

        articles_is_present = True
        while articles_is_present:
            results = search.get_dict()

            for article in results.get("articles", []):
                title = article.get("title")
                link = article.get("link")
                citation_id = article.get("citation_id")
                authors = article.get("authors")
                publication = article.get("publication")
                cited_by_value = article.get("cited_by", {}).get("value")
                cited_by_link = article.get("cited_by", {}).get("link")
                cited_by_cites_id = article.get("cited_by", {}).get("cites_id")
                year = article.get("year")

                author_article_results_data.append({
                    "article_title": title,
                    "article_link": link,
                    "article_year": year,
                    "article_citation_id": citation_id,
                    "article_authors": authors,
                    "article_publication": publication,
                    "article_cited_by_value": cited_by_value,
                    "article_cited_by_link": cited_by_link,
                    "article_cited_by_cites_id": cited_by_cites_id,
                })

            if "next" in results.get("serpapi_pagination", {}):
                search.params_dict.update(dict(parse_qsl(urlsplit(results.get("serpapi_pagination").get("next")).query)))
            else:
                articles_is_present = False

    return author_article_results_data
Scrape all author articles explanation
Import libraries and the profile_results() function:
import os
from serpapi import GoogleSearch
from google_scholar_profile_results import profile_results
from urllib.parse import urlsplit, parse_qsl
import pandas as pd
Here, profile_results() is again used to get each author_id so that the author's articles can be parsed.
Create a temporary list to store the extracted data:
author_article_results_data = []
Iterate over profile_results() and pass each author_id to the search parameters:
for index, author_id in enumerate(profile_results(), start=1):
    print(f"Parsing {index} author with {author_id['author_id']} author ID.")

    params = {
        "api_key": os.getenv("API_KEY"),     # SerpApi API key
        "engine": "google_scholar_author",   # author results search engine
        "hl": "en",                          # language
        "sort": "pubdate",                   # sort by year
        "author_id": author_id["author_id"]  # search query
    }
    search = GoogleSearch(params)
Set up a while loop and check if a next page is present:
articles_is_present = True
while articles_is_present:
    results = search.get_dict()

    # data extraction code..

    # if next page is present -> update params to the new page results.
    # if next page is not present -> exit the while loop.
    if "next" in results.get("serpapi_pagination", {}):
        search.params_dict.update(dict(parse_qsl(urlsplit(results.get("serpapi_pagination").get("next")).query)))
    else:
        articles_is_present = False
Extract the data in a for loop:
for article in results.get("articles", []):
    title = article.get("title")
    link = article.get("link")
    citation_id = article.get("citation_id")
    authors = article.get("authors")
    publication = article.get("publication")
    cited_by_value = article.get("cited_by", {}).get("value")
    cited_by_link = article.get("cited_by", {}).get("link")
    cited_by_cites_id = article.get("cited_by", {}).get("cites_id")
    year = article.get("year")
Append the extracted data to the temporary list as a dictionary:
author_article_results_data.append({
    "article_title": title,
    "article_link": link,
    "article_year": year,
    "article_citation_id": citation_id,
    "article_authors": authors,
    "article_publication": publication,
    "article_cited_by_value": cited_by_value,
    "article_cited_by_link": cited_by_link,
    "article_cited_by_cites_id": cited_by_cites_id,
})
Return the extracted data:
return author_article_results_data
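A quick usage sketch (the __main__ guard is an addition, not part of the original script):

if __name__ == "__main__":
    articles = all_author_articles()
    print(f"Extracted {len(articles)} articles in total.")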

Save Google Scholar Profile, Author and Articles Results to CSV
from google_scholar_profile_results import profile_results
import pandas as pd

# author_results() and all_author_articles() are the functions defined in the
# sections above; in the Full Code section below they all live in one file


def save_profile_results_to_csv():
    print("Waiting for profile results to save..")
    pd.DataFrame(data=profile_results()).to_csv("google_scholar_profile_results.csv", encoding="utf-8", index=False)
    print("Profile Results Saved.")


def save_author_result_to_csv():
    print("Waiting for author results to save..")
    pd.DataFrame(data=author_results()).to_csv("google_scholar_author_results.csv", encoding="utf-8", index=False)
    print("Author Results Saved.")


def save_author_articles_to_csv():
    print("Waiting for author articles to save..")
    pd.DataFrame(data=all_author_articles()).to_csv("google_scholar_author_articles.csv", encoding="utf-8", index=False)
    print("Author Articles Saved.")
- The data argument in DataFrame is your data.
- The encoding='utf-8' argument just makes sure everything is saved correctly. I used it explicitly even though it's the default value.
- The index=False argument drops the default pandas row numbers.
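Note that nested fields such as interests, cited_by_table or co_authors end up in the CSV as Python-repr strings. If you prefer flat columns, one option (a sketch, not part of the original code) is pandas' json_normalize:

import pandas as pd
from google_scholar_profile_results import profile_results

# json_normalize flattens nested dictionaries into dotted column names;
# list-valued fields such as interests are kept as-is
df = pd.json_normalize(profile_results())
df.to_csv("google_scholar_profile_results_flat.csv", encoding="utf-8", index=False)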
Full Code
import os
from serpapi import GoogleSearch
from urllib.parse import urlsplit, parse_qsl
import pandas as pd


def profile_results():
    print("Extracting profile results..")

    params = {
        "api_key": os.getenv("API_KEY"),      # SerpApi API key
        "engine": "google_scholar_profiles",  # profile results search engine
        "mauthors": "blizzard",               # search query
    }
    search = GoogleSearch(params)

    profile_results_data = []

    profiles_is_present = True
    while profiles_is_present:
        profile_results = search.get_dict()

        for profile in profile_results.get("profiles", []):
            print(f'Currently extracting {profile.get("name")} with {profile.get("author_id")} ID.')

            thumbnail = profile.get("thumbnail")
            name = profile.get("name")
            link = profile.get("link")
            author_id = profile.get("author_id")
            affiliations = profile.get("affiliations")
            email = profile.get("email")
            cited_by = profile.get("cited_by")
            interests = profile.get("interests")

            profile_results_data.append({
                "thumbnail": thumbnail,
                "name": name,
                "link": link,
                "author_id": author_id,
                "email": email,
                "affiliations": affiliations,
                "cited_by": cited_by,
                "interests": interests
            })

        if "next" in profile_results.get("pagination", {}):
            search.params_dict.update(dict(parse_qsl(urlsplit(profile_results.get("pagination").get("next")).query)))
        else:
            profiles_is_present = False

    return profile_results_data


def author_results():
    print("extracting author results..")

    author_results_data = []

    for author_id in profile_results():
        print(f"Parsing {author_id['author_id']} author ID.")

        params = {
            "api_key": os.getenv("API_KEY"),      # SerpApi API key
            "engine": "google_scholar_author",    # author results search engine
            "author_id": author_id["author_id"],  # search query
            "hl": "en"
        }
        search = GoogleSearch(params)
        results = search.get_dict()

        thumbnail = results.get("author", {}).get("thumbnail")
        name = results.get("author", {}).get("name")
        affiliations = results.get("author", {}).get("affiliations")
        email = results.get("author", {}).get("email")
        website = results.get("author", {}).get("website")
        interests = results.get("author", {}).get("interests")

        cited_by_table = results.get("cited_by", {}).get("table")
        cited_by_graph = results.get("cited_by", {}).get("graph")

        public_access_link = results.get("public_access", {}).get("link")
        available_public_access = results.get("public_access", {}).get("available")
        not_available_public_access = results.get("public_access", {}).get("not_available")

        co_authors = results.get("co_authors")

        author_results_data.append({
            "thumbnail": thumbnail,
            "name": name,
            "affiliations": affiliations,
            "email": email,
            "website": website,
            "interests": interests,
            "cited_by_table": cited_by_table,
            "cited_by_graph": cited_by_graph,
            "public_access_link": public_access_link,
            "available_public_access": available_public_access,
            "not_available_public_access": not_available_public_access,
            "co_authors": co_authors
        })

    return author_results_data


def all_author_articles():
    author_article_results_data = []

    for index, author_id in enumerate(profile_results(), start=1):
        print(f"Parsing author #{index} with {author_id['author_id']} author ID.")

        params = {
            "api_key": os.getenv("API_KEY"),     # SerpApi API key
            "engine": "google_scholar_author",   # author results search engine
            "hl": "en",                          # language
            "sort": "pubdate",                   # sort by year
            "author_id": author_id["author_id"]  # search query
        }
        search = GoogleSearch(params)

        articles_is_present = True
        while articles_is_present:
            results = search.get_dict()

            for article in results.get("articles", []):
                title = article.get("title")
                link = article.get("link")
                citation_id = article.get("citation_id")
                authors = article.get("authors")
                publication = article.get("publication")
                cited_by_value = article.get("cited_by", {}).get("value")
                cited_by_link = article.get("cited_by", {}).get("link")
                cited_by_cites_id = article.get("cited_by", {}).get("cites_id")
                year = article.get("year")

                author_article_results_data.append({
                    "article_title": title,
                    "article_link": link,
                    "article_year": year,
                    "article_citation_id": citation_id,
                    "article_authors": authors,
                    "article_publication": publication,
                    "article_cited_by_value": cited_by_value,
                    "article_cited_by_link": cited_by_link,
                    "article_cited_by_cites_id": cited_by_cites_id,
                })

            if "next" in results.get("serpapi_pagination", {}):
                search.params_dict.update(dict(parse_qsl(urlsplit(results.get("serpapi_pagination").get("next")).query)))
            else:
                articles_is_present = False

    return author_article_results_data


def save_author_result_to_csv():
    print("Waiting for author results to save..")
    pd.DataFrame(data=author_results()).to_csv("google_scholar_author_results.csv", encoding="utf-8", index=False)
    print("Author Results Saved.")


def save_author_articles_to_csv():
    print("Waiting for author articles to save..")
    pd.DataFrame(data=all_author_articles()).to_csv("google_scholar_author_articles.csv", encoding="utf-8", index=False)
    print("Author Articles Saved.")


def save_profile_results_to_csv():
    print("Waiting for profile results to save..")
    pd.DataFrame(data=profile_results()).to_csv("google_scholar_profile_results.csv", encoding="utf-8", index=False)
    print("Profile Results Saved.")
Links
- GitHub Repository
- Code in the online IDE
- Google Scholar Profiles API
- Google Scholar Author API
Outro
If your goal is to extract data without needing to write a parser from scratch, to figure out how to bypass blocks from search engines, how to scale it, or how to extract data from JavaScript, try SerpApi.
If you have anything to share, any questions, suggestions, or something that isn't working correctly, feel free to drop a comment in the comment section or reach out via Twitter at @dimitryzub or [@serp_api](https://twitter.com/serp_api).
Join us on Reddit | Twitter | YouTube
Add a Feature Request💫 or a Bug🐞