(毕设1)爬虫+mysql+flask+echarts实现网站数据可视化(附源码)
目录1. 项目要求与内容2.数据爬取2.1分析url,网页源码2.2编写代码2.3 数据清洗3.数据存储3.1mysql中需要建立的6张表3.2 建表语句3.3将2中清洗后的数据通过navicat导入4.flask web网站的搭建4.1 使用pycharm建一个flask项目4.2 后端4.3 前端4.4项目框架5.项目展示6.源码点击1. 项目要求与内容利用python爬取数据并进行清洗和预处
·
目录
1. 项目要求与内容
利用python爬取数据并进行清洗和预处理,将清洗后的数据存到mysql数据库中,后端利用flask框架,用Echarts实现数据可视化。
1.2完成项目基本内容:
爬取豆瓣读书top250网页上相关信息;
对爬取保存下来的数据文件进行清洗和预处理;
将清洗好的数据导入数据库;
进行需求分析,对要实现的数据可视化效果进行分析,创建相应的数据库表并导入数据;
选用python语言进行后端管理:
2. 开发工具及相关技术
pycharm
navicat
mysql
2.数据爬取
2.1分析url,网页源码
2.2编写代码
# 1. 导入库包
import requests
from lxml import etree
from time import sleep
import os
import pandas as pd
import re
BOOKS = []
IMGURLS = []
# 2. 获取网页源代码
def get_html(url):
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
}
# 异常处理
try:
html = requests.get(url, headers=headers)
# 声明编码方式
html.encoding = html.apparent_encoding
# 判断
if html.status_code == 200:
print('成功获取源代码')
# print(html.text)
except Exception as e:
print('获取源代码失败:%s' % e)
# 返回html
return html.text
# 3. 解析网页源代码
def parse_html(html):
html = etree.HTML(html)
# 每个图书信息分别保存在 class="indent" 的div下的 table标签内
tables = html.xpath("//div[@class='indent']//table")
# print(len(tables)) # 打印之后如果是25的话就是对的
books = []
imgUrls = []
# 遍历通过xpath得到的li标签列表
# 因为要获取标题文本,所以xpath表达式要追加 /text(), t.xpath返回的是一个列表,且列表中只有一个元素所以追加一个[0]
for t in tables:
# title = t.xpath(".//div[@class='p12']/a/@title") # 匹配得到的是空的
# 书名
title = t.xpath(".//td[@valign='top']//a/@title")[0]
# 链接
link = t.xpath(".//td[@valign='top']//a/@href")[0]
# 获取pl标签的字符串
pl = t.xpath(".//td[@valign='top']//p[1]/text()")[0]
# 截取国家
if '[' in pl:
country = pl.split('[')[1].split(']')[0]
else:
country = '中' # 没有国家的默认为“中国”
# 截取作者
if '[' in pl:
author = pl.split(']')[1].split('/')[0].replace(" ", "")
elif len(pl.split('/')) == 3:
author = '无'
elif len(pl.split('/')) == 2:
author = pl.split('/')[0]
elif '[' not in pl:
if len(pl.split('/')) == 4:
author = pl.split('/')[-4]
elif len(pl.split('/')) == 5:
author = pl.split('/')[-5]
elif len(pl.split('/')) == 6:
author = pl.split('/')[-6]
else:
author = '无'
# 截取翻译者
if len(pl.split('/')) == 3:
translator = ' '
elif '[' in pl:
if len(pl.split('/')) == 4:
translator = pl.split('/')[-3]
elif len(pl.split('/')) == 5:
translator = pl.split('/')[-4]
elif len(pl.split('/')) == 6:
translator = pl.split('/')[-5]
else:
translator = ' '
# 截取出版社
if len(pl.split('/')) == 2:
publisher=pl.split('/')[0]
elif len(pl.split('/'))== 3:
publisher = pl.split('/')[0]
elif '[' in pl:
if len(pl.split('/'))== 4:
publisher = pl.split('/')[1]
elif len(pl.split('/')) == 5:
publisher = pl.split('/')[2]
elif len(pl.split('/')) == 6:
publisher = pl.split('/')[-3]
elif len(pl.split('/')) == 7:
publisher = pl.split('/')[-4]
elif '[' not in pl:
#if len(pl.split('/'))== 3:
publisher = pl.split('/')[-3]
# if len(pl.split('/')) == 6:
# publisher = pl.split('/')[-3]
# elif len(pl.split('/')) == 7:
# publisher = pl.split('/')[-4]
# 截取出版时间
if len(pl.split('/')) == 2:
time= '不详'
elif len(pl.split('/')) == 4:
time = pl.split('/')[-2]
elif len(pl.split('/')) == 5:
time = pl.split('/')[-2]
elif len(pl.split('/')) == 6:
time = pl.split('/')[-2]
# 截取单价
if '元' in pl:
price = pl.split('/')[-1].split('元')[0]
else:
price = pl.split('/')[-1]
# 获取星级数
str1 = t.xpath(".//td[@valign='top']//div[@class='star clearfix']/span[1]/@class")[0].replace("allstar", "")
# 此时获取到的数字其实是字符串类型,不能直接%10,需要把str转化为int
num = int(str1)
star = num / 10
# 获取评分
score = t.xpath(".//td[@valign='top']//div[@class='star clearfix']/span[2]/text()")[0]
# 获取评价人数
pnum = t.xpath(".//td[@valign='top']//div[@class='star clearfix']/span[3]/text()")[0]
people = re.sub("\D", "", pnum)
# 获取简介
comments = t.xpath(".//p[@class='quote']/span/text()")
comment = comments[0] if len(comments) != 0 else "无"
book = {
'书名': title,
'链接': link,
'国家': country,
'作者': author,
'翻译者': translator,
'出版社': publisher,
'出版时间': time,
'价格': price,
'星级': star,
'评分': score,
'评价人数': people,
'简介': comment
}
# 图片
imgUrl = t.xpath(".//a/img/@src")[0]
# print(imgUrl)
books.append(book)
imgUrls.append(imgUrl)
return books, imgUrls
# 4. 下载图片保存文件
def downloadimg(url, book):
# 判断文件夹是否在指定路径下面,建立文件夹并把指定路径移到文件夹下面
if 'img' in os.listdir(r'D:\pachong'):
pass
else:
os.mkdir(r'D:\pachong\img')
os.chdir(r'D:\pachong\img')
# 返回img的二进制流
img = requests.request('GET', url).content
with open(book['书名'] + '.jpg', 'wb') as f:
# print('正在下载: %s' % url)
f.write(img)
# 5. 数据预处理
# def processData():
if __name__ == '__main__':
# url = 'https://book.douban.com/top250?start=0'
# 10页循环遍历
for i in range(10):
# 2. 定义url并获取网页源代码
url = 'https://book.douban.com/top250?start={}'.format(i * 25)
# print(url)
html = get_html(url)
# 3. 解析网页源代码
sleep(1)
books = parse_html(html)[0]
imgUrls = parse_html(html)[1]
BOOKS.extend(books)
IMGURLS.extend(imgUrls)
# 4. 下载图片保存文件
# for i in range(250):
# # sleep(1)
# downloadimg(IMGURLS[i], BOOKS[i])
os.chdir(r'D:/pachong/img')
# 以csv格式写入本地
bookdata = pd.DataFrame(BOOKS)
bookdata.to_csv('D:/pachong/book.csv', index=False)
print("图书信息写入本地成功")
# 以txt格式写入本地错误
# 得到的是字典格式,要想写成txt格式需要先转化成字符串格式
# for i in range(25):
# with open('book.txt', 'a') as f:
# f.write(books[i] + '\n')
2.3 数据清洗
#数据清洗
import pandas as pd
import re
import time
# 先读取数据文件
data = pd.read_csv('D:/pachong/book.csv')
result = pd.DataFrame(data)
a = result.dropna(axis=0, how='any')
pd.set_option('display.max_rows', None) #输出全部行,不省略
b = u'数据'
number = 1
b1 = '1981-8'
li1 = a['出版社']
for i in range(0, len(li1)):
try:
if b1 in li1[i]:
# print(number,li1[i])
number += 1
a = a.drop(i, axis=0)
except:
pass
b2 = '中国基督'
a['出版时间'] = a['出版时间'].str[0: 5]
li2 = a['出版时间']
for i in range(0, len(li2)):
try:
if b2 in li2[i]:
# print(number,li2[i])
number += 1
a = a.drop(i, axis=0)
except:
pass
b3 = 'CNY'
li3 = a['价格']
for i in range(0, len(li3)):
try:
if b3 in li3[i]:
a['价格'] = li3.str.replace('CNY', '')
except:
pass
b41 = '清'
b42 = '明'
li4 = a['国家']
a['国家'] = li4.str.replace("国", "")
for i in range(0, len(li4)):
try:
if b41 in li4[i]:
a['国家'] = li4.str.replace('清', '中')
if b42 in li4[i]:
a['国家'] = li4.str.replace('明', '中')
except:
pass
time.sleep(3)
a.to_csv('newbook.csv', index=False)
3.数据存储
3.1 mysql中需要建立的6张表
books | 原始数据表 |
---|---|
book_country_num | 出版国家 |
book_publisher_num | 出版社表 |
book_presstime_num | 出版时间表 |
book_people_title | 评论人数top10数据表 |
book_score_num | 评分数据表 |
3.2 建表语句
CREATE TABLE `books` (
`title` varchar(255) DEFAULT NULL,
`link` varchar(255) DEFAULT NULL,
`country` varchar(255) DEFAULT NULL,
`author` varchar(255) DEFAULT NULL,
`translator` varchar(255) DEFAULT NULL,
`publisher` varchar(255) DEFAULT NULL,
`press_time` int(11) DEFAULT NULL,
`price` decimal(10,2) DEFAULT NULL,
`star` float DEFAULT NULL,
`score` float DEFAULT NULL,
`people` int(11) DEFAULT NULL,
`comment` varchar(255) DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
CREATE TABLE `book_country_num` (
`country` varchar(255) DEFAULT NULL COMMENT '国家',
`num` int(11) DEFAULT NULL COMMENT '数量'
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
insert into book_country_num
select
country, count(*) as num
from books
group by country;
CREATE TABLE `book_publisher_num` (
`publisher` varchar(255) DEFAULT NULL COMMENT '出版社',
`num` int(11) DEFAULT NULL COMMENT '数量'
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
insert into book_publisher_num
select
publisher, count(*) as num
from books
group by publisher;
CREATE TABLE `book_presstime_num` (
`press_time` int(11) DEFAULT NULL COMMENT '出版时间',
`num` int(11) DEFAULT NULL COMMENT '数量'
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
insert into book_presstime_num
select
press_time, count(*) as num
from books
group by press_time;
CREATE TABLE `book_people_title` (
`people` int(11) DEFAULT NULL COMMENT '评论人数',
`title` varchar(255) DEFAULT NULL COMMENT '书名'
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
insert into book_people_title
select
people, title
from books
group by people
order by people desc
limit 10;
CREATE TABLE `book_score_num` (
`score` float DEFAULT NULL COMMENT '评分',
`num` int(11) DEFAULT NULL COMMENT '数量'
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
insert into book_score_num
select
score, count(*) as num
from books
group by score;
3.3将2中清洗后的数据通过navicat导入
4.flask web网站的搭建
4.1 使用pycharm建一个flask项目
4.2 后端
from flask import Flask,render_template
import pymysql
app = Flask(__name__)
@app.route('/')
def index():
return render_template("index.html")
@app.route('/index')
def home():
#return render_template("index.html")
return index()
@app.route('/movie')
def movie():
datalist = []
con = pymysql.connect(
host='192.168.43.100',
port=3306,
user='root',
passwd='Root@123',
db='doubanbook',
charset='utf8'
)
cur = con.cursor()
sql = "select * from books"
data = cur.execute(sql)
result=cur.fetchall()
for item in result:
datalist.append(item)
cur.close()
cur.close()
print(datalist)
return render_template("movie.html", movies=datalist)
@app.route('/score')
def score():
score = [] #评分
num = [] #每个评分所统计出的电影数量
conn = pymysql.Connect(
host='192.168.43.100',
port=3306,
user='root',
passwd='Root@123',
db='doubanbook',
charset='utf8'
)
cur = conn.cursor()
sql = "select * from book_score_num"
data = cur.execute(sql)
result = cur.fetchall()
for item in result:
score.append(str(item[0]))
num.append(item[1])
cur.close()
conn.close()
return render_template("score.html",score= score,num=num)
@app.route('/country')
def country():
country = [] #评分
num = [] #每个评分所统计出的电影数量
conn = pymysql.Connect(
host='192.168.43.100',
port=3306,
user='root',
passwd='Root@123',
db='doubanbook',
charset='utf8'
)
cur = conn.cursor()
sql = "select * from book_country_num"
data = cur.execute(sql)
result = cur.fetchall()
for item in result:
country.append(str(item[0]))
num.append(item[1])
cur.close()
conn.close()
return render_template("country.html",country=country,num=num)
@app.route('/peopletop10')
def peopletop10():
people = [] #评论人数
title = [] #书名
s=[]
conn = pymysql.Connect(
host='192.168.43.100',
port=3306,
user='root',
passwd='Root@123',
db='doubanbook',
charset='utf8'
)
cur = conn.cursor()
sql = "select * from book_people_title"
data = cur.execute(sql)
result = cur.fetchall()
for item in result:
s.append(item)
people.append(str(item[0]))
title.append(item[1])
cur.close()
conn.close()
return render_template("peopletop10.html", people=people,title=title)
@app.route('/presstime')
def presstime():
year = []
num = []
s=[]
conn = pymysql.Connect(
host='192.168.43.100',
port=3306,
user='root',
passwd='Root@123',
db='doubanbook',
charset='utf8'
)
cur = conn.cursor()
sql = "select * from book_presstime_num"
data = cur.execute(sql)
result = cur.fetchall()
for item in result:
s.append(item)
year.append(str(item[0]))
num.append(item[1])
cur.close()
conn.close()
return render_template("presstime.html", year=year,num=num)
@app.route('/publisher')
def publisher():
year = []
num = []
s=[]
conn = pymysql.Connect(
host='192.168.43.100',
port=3306,
user='root',
passwd='Root@123',
db='doubanbook',
charset='utf8'
)
cur = conn.cursor()
sql = "select * from book_publisher_num"
data = cur.execute(sql)
result = cur.fetchall()
for item in result:
s.append(item)
year.append(str(item[0]))
num.append(item[1])
cur.close()
conn.close()
return render_template("publisher.html", year=year,num=num)
@app.route('/word')
def word():
return render_template("word.html")
@app.route('/team')
def team():
return render_template("team.html")
if __name__ == '__main__':
app.run()
4.3 前端
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta content="width=device-width, initial-scale=1.0" name="viewport">
<title>豆瓣图书Top250数据分析</title>
<meta content="" name="descriptison">
<meta content="" name="keywords">
<!-- Favicons -->
<link href="static/assets/img/favicon.png" rel="icon">
<link href="static/assets/img/apple-touch-icon.png" rel="apple-touch-icon">
<!-- Google Fonts -->
<link href="https://fonts.googleapis.com/css?family=Open+Sans:300,300i,400,400i,600,600i,700,700i|Raleway:300,300i,400,400i,600,600i,700,700i,900" rel="stylesheet">
<!-- Vendor CSS Files -->
<link href="static/assets/vendor/bootstrap/css/bootstrap.min.css" rel="stylesheet">
<link href="static/assets/vendor/icofont/icofont.min.css" rel="stylesheet">
<link href="static/assets/vendor/boxicons/css/boxicons.min.css" rel="stylesheet">
<link href="static/assets/vendor/animate.css/animate.min.css" rel="stylesheet">
<link href="static/assets/vendor/venobox/venobox.css" rel="stylesheet">
<link href="static/assets/vendor/aos/aos.css" rel="stylesheet">
<!-- Template Main CSS File -->
<link href="static/assets/css/style.css" rel="stylesheet">
</head>
<body>
<!-- ======= Header ======= -->
<header id="header">
<div class="container">
<div class="logo float-left">
<h1 class="text-light"><a href="temp.html"><span>孙浩</span></a></h1>
<!-- Uncomment below if you prefer to use an image logo -->
<!-- <a href="temp.html"><img src="static/assets/img/logo.png" alt="" class="img-fluid"></a>-->
</div>
<nav class="nav-menu float-right d-none d-lg-block">
<ul>
<li class="active"><a href="/index">首页<i class="la la-angle-down"></i></a></li>
<li><a href="/movie">电影</a></li>
<li><a href="/country">国家</a></li>
<li><a href="/score">评分</a></li>
<li><a href="/peopletop10">评论人数top10</a></li>
<li><a href="/presstime">出版年份</a></li>
<li><a href="/publisher">出版社</a></li>
<li><a href="/word">词云</a></li>
<li><a href="/team">团队</a></li>
</ul>
</nav><!-- .nav-menu -->
</div>
</header><!-- End Header -->
<!-- ======= Our Team Section ======= -->
<section id="team" class="team">
<div class="container">
<div class="section-title">
<h2>豆瓣电影Top250电影</h2>
</div>
<!-- ======= Counts Section ======= -->
<section class="counts section-bg">
<div class="container">
<table class="table table-striped">
<tr>
<td>书名</td>
<td>作者</td>
<td>译者</td>
<td>出版社</td>
<td>出版时间</td>
<td>价格</td>
<td>星级</td>
<td>评分</td>
<td>评价人数</td>
<td>一句话概述</td>
</tr>
{% for movie in movies %}
<tr>
<td>
<a href="{{ movie[1] }}" target="_blank">
{{movie[0]}}
</a>
</td>
<td>{{ movie[3] }}</td>
<td>{{movie[4]}}</td>
<td>{{movie[5]}}</td>
<td>{{movie[6]}}</td>
<td>{{movie[7]}}</td>
<td>{{movie[8]}}</td>
<td>{{movie[9]}}</td>
<td>{{movie[10]}}</td>
<td>{{movie[11]}}</td>
<td>{{movie[12]}}</td>
</tr>
{% endfor %}
</table>
</div>
</div>
</section><!-- End Counts Section -->
</div>
</section><!-- End Our Team Section -->
<!-- ======= Footer ======= -->
<footer id="footer">
<div class="container">
<div class="copyright">
© Copyright <strong><span>孙浩</span></strong>. All Rights Reserved
</div>
</div>
</footer><!-- End Footer -->
<a href="#" class="back-to-top"><i class="icofont-simple-up"></i></a>
<!-- Vendor JS Files -->
<script src="static/assets/vendor/jquery/jquery.min.js"></script>
<script src="static/assets/vendor/bootstrap/js/bootstrap.bundle.min.js"></script>
<script src="static/assets/vendor/jquery.easing/jquery.easing.min.js"></script>
<script src="static/assets/vendor/php-email-form/validate.js"></script>
<script src="static/assets/vendor/jquery-sticky/jquery.sticky.js"></script>
<script src="static/assets/vendor/venobox/venobox.min.js"></script>
<script src="static/assets/vendor/waypoints/jquery.waypoints.min.js"></script>
<script src="static/assets/vendor/counterup/counterup.min.js"></script>
<script src="static/assets/vendor/isotope-layout/isotope.pkgd.min.js"></script>
<script src="static/assets/vendor/aos/aos.js"></script>
<!-- Template Main JS File -->
<script src="static/assets/js/main.js"></script>
</body>
</html>
因其余前端页面类似,故只展示一个
4.4项目框架
5.项目展示
6.源码 点击
更多推荐
已为社区贡献1条内容
所有评论(0)