由于工作需要,要抓取谷歌app市场的icon,花了点时间粗略地看了下Python,基本搞出来了,代码如下:


#!/usr/bin/env python
#-*- encoding: utf8 -*-
# author : xxx
# version: 1.1.0
# Date   : 2015/09/07 09:30:00
# 功能   : 定期在Google Play抓取游戏软件最新版本信息

# 版本号标签 <div class="content" itemprop="softwareVersion"> 1.0.0  </div>
# 图片标签 <img class="cover-image" src="https://lh3.googleusercontent.com/68uKqs4VBQ5Sl2f7kqGmy1sLYzezmAn_LrV993b4Vw6vn5gAYRk0mGhqC3ZnLnzeU0A=w300-rw" alt="Cover art" aria-hidden="true" itemprop="image">



import time
import urllib2
import HTMLParser
import re
import os
import sys
import xlrd
from GifImagePlugin import getdata




# Path of the Excel workbook that lists the apps whose icons are fetched.
# Written as a unicode literal with an escaped backslash (same value as the
# former ur"..." literal, but also valid syntax on Python 3).
xls_filename = u"D:\\需抓取icon资源信息.xlsx"
# Base URL of a Google Play details page; the package name is appended to it.
host = "https://play.google.com/store/apps/details?hl=zh_CN&id="
# Root directory under which downloaded icons are saved.
localSavePath = 'D:\\icon\\'
# Directory the log files are written to.
# The HTML <span> markup that had leaked into this literal was removed.
# NOTE(review): this is a raw string, so the backslashes are doubled in the
# actual value, unlike localSavePath — confirm that is intended.
logPath = r"D:\\icon\\"



# Parse the xls workbook
def analyXls(filename):
    """Read every sheet of the workbook at ``filename`` into XlsModel objects.

    Rows are read starting at row index 3 of each sheet. For every row the
    cells in columns 1..5 are taken as resId, resType, resName, pkgName and
    pkgVersion. resId and resType are passed through ``int()`` and then
    ``str()``; pkgName and pkgVersion through ``str()``; resName is used as
    returned by xlrd.

    :param filename: path of the workbook to open.
    :return: list of XlsModel, in sheet order then row order.
    """
    print("analy excel file start ... ")

    # List of app models collected from all sheets
    modelList = []

    # Open the workbook that was actually requested by the caller
    # (previously the global xls_filename was used and the argument ignored).
    data = xlrd.open_workbook(filename)

    for sheet_name in data.sheet_names():
        print(sheet_name)
        # A single worksheet
        sheet = data.sheet_by_name(sheet_name)

        # A sheet without any column has nothing to read.
        if sheet.ncols < 1:
            continue

        for row in range(3, sheet.nrows):
            resId = str(int(sheet.cell_value(row, 1))).strip()
            resType = str(int(sheet.cell_value(row, 2))).strip()
            resName = sheet.cell_value(row, 3)
            pkgName = str(sheet.cell_value(row, 4)).strip()
            pkgVersion = str(sheet.cell_value(row, 5)).strip()

            modelList.append(XlsModel(resId=resId, resType=resType,
                                      resName=resName, pkgName=pkgName,
                                      pkgVersion=pkgVersion))

    print("analy excel file end ... ")

    return modelList

   
# Fetch a web page
def getDoc(url):
    """Download ``url`` and return the response body.

    Return values, as relied upon by the caller:

    * the page body on success;
    * ``None`` when the server answers with HTTP 404;
    * the ``reason`` object of the URLError when its string form is
      ``"timed out"`` (the caller detects this via ``str(doc)``).

    :raises Exception: for any HTTP error other than 404, any other URLError
        and any other failure; the original exception is passed along as the
        second argument.
    """
    # Defined before the try block so the finally clause can always test it.
    response = None
    try:
        response = urllib2.urlopen(url, timeout=6)
        html = response.read()
        print("获取应用成功:[Code:%s] %s" % (response.code, url))
        return html
    except urllib2.HTTPError as e:
        print("HTTPError------")
        print("获取应用失败:[Code:%s]" % e.code)
        print(url)
        if str(e.code) != "404":
            raise Exception("非404错误,访问url网络异常", e)
        return None
    except urllib2.URLError as e:
        print("URLError------")
        print("获取应用失败:[Code:%s]" % e.reason)
        print(url)
        # The VPN dropped / the network is unreachable: hand the reason back
        # so the caller can retry.
        if "timed out" == str(e.reason):
            return e.reason
        raise Exception("未知错误,访问url网络异常", e)
    except Exception as e:
        print("Exception------")
        print(e)
        raise Exception("非404错误,访问url网络异常", e)
    finally:
        if response is not None:
            response.close()
# Version number
def getVersion(doc):
    """Extract the app version from a Google Play details page.

    Looks for ``<div class="content" itemprop="softwareVersion">...</div>``
    and returns its stripped content, or ``""`` when the tag is not found.

    A byte-string match is decoded explicitly as UTF-8 instead of relying on
    the interpreter's default encoding, so non-ASCII version text no longer
    raises UnicodeDecodeError under the ASCII default.
    """
    reg = r'<div class="content" itemprop="softwareVersion">(.+?)</div>'
    match = re.search(reg, doc)
    if not match:
        return ""

    version = match.group(1).strip()
    # On Python 2 ``bytes`` is ``str``; text that is already unicode is kept.
    if isinstance(version, bytes):
        version = version.decode("utf-8")
    return version



# URL of the icon image
def getIconUrl(doc):
    """Return the stripped ``src`` of the cover-image ``<img>`` tag in ``doc``.

    Returns ``None`` when the page contains no matching tag.
    """
    pattern = r'<img class="cover-image" src="(.+?)" alt="Cover art" aria-hidden="true" itemprop="image">'
    found = re.search(pattern, doc)
    return found.group(1).strip() if found else None


# Download the icon
def downloadImage(url,xlsModel):
    """Download the image at ``url`` and store it for ``xlsModel``.

    The file is written to
    ``localSavePath + xlsModel.getFilePath() + os.sep + xlsModel.getFileName()``
    with an extension taken from the subtype of the response's Content-Type
    header. A jpeg/jpg subtype is stored with a ``png`` extension; the image
    data itself is written unchanged.

    :param url: image URL; must not be empty.
    :param xlsModel: model providing ``getFilePath()`` and ``getFileName()``.
    :raises ValueError: if ``url`` is empty or None.
    """
    if not url:
        raise ValueError("icon url is empty")

    print("icon url ... %s" % url)
    # Same timeout as getDoc, so a stalled download cannot block forever.
    response = urllib2.urlopen(url, timeout=6)
    try:
        # Image type, used as the file extension. Drop any parameters that
        # follow the media type (e.g. "; charset=...").
        imgType = response.info().getheader("Content-Type") or ""
        ext = imgType.split(";")[0].strip()
        ext = ext[ext.find('/') + 1:]
        if ext.lower() in ("jpeg", "jpg"):
            print("原类型为:" + ext + "  转为:png")
            ext = "png"
        if not ext:
            # No usable Content-Type: fall back to the default extension.
            ext = "png"
        # Read the body before creating the file so a failed download does
        # not leave an empty file behind.
        cont = response.read()
    finally:
        response.close()

    filepath = localSavePath + xlsModel.getFilePath()
    if not os.path.exists(filepath):
        os.makedirs(filepath)

    filename = filepath + os.sep + xlsModel.getFileName() + "." + ext

    with open(filename, 'wb+') as f:
        f.write(cont)

    print("ok 文件下载成功: %s" % (filename))


# Compare two version strings, e.g. a = "v 1.2.7 a"  b = "a1.2.18 a"
def _splitVersion(text):
    """Split a normalised version string into ``(kind, value)`` tokens.

    ``kind`` is ``"match1"`` for a run of letters/dots (dots removed from the
    value) and ``"match2"`` for each dot-separated piece of a run of
    digits/dots (trailing dots and zeros of the run are dropped first).
    Returns ``None`` if a character outside ``[A-Za-z0-9.]`` is met.
    """
    tokens = []
    while text:
        letters = re.match(r"[A-Za-z.]+", text)
        if letters:
            run = letters.group(0)
            text = text[len(run):]
            tokens.append(("match1", run.replace(".", "")))
            continue

        digits = re.match(r"[0-9.]+", text)
        if not digits:
            # Neither pattern consumes anything: stop instead of spinning.
            return None
        run = digits.group(0)
        text = text[len(run):]
        # Trailing dots and zeros are not significant ("1.2.0" -> "1.2").
        run = run.rstrip(".0")
        for piece in run.split("."):
            tokens.append(("match2", piece))
    return tokens


def _compareNumericPart(a, b):
    """Compare two numeric pieces; return 1, -1 or 0.

    A piece with a leading zero and more than one digit is read as a
    fraction ("05" -> 0.5). Otherwise the shorter piece is right-padded with
    zeros to the length of the longer one before comparing ("7" vs "18"
    compares 70 with 18).
    """
    lenA = len(a)
    lenB = len(b)
    if lenA > 1 and a.startswith("0"):
        a = "0." + a[1:]
    if lenB > 1 and b.startswith("0"):
        b = "0." + b[1:]

    if "." not in a and "." not in b:
        # Pad the shorter piece with zeros
        if lenA > lenB:
            b = b + "0" * (lenA - lenB)
        elif lenB > lenA:
            a = a + "0" * (lenB - lenA)

    numA = float(a)
    numB = float(b)
    return (numA > numB) - (numA < numB)


def eqSize(v1,v2):
    """Compare two version strings.

    Spaces are removed and letters lower-cased before comparing. Returns:

        1: v1 > v2
        2: v2 > v1
        3: v1 == v2
        4: cannot be compared, e.g. "v1.1" against "1.2", an empty or "."
           value on one side only, an unsupported character, or any error

    Tokens are compared pairwise; when all shared tokens are equal, the
    version with more tokens is the greater one. Note that numeric pieces
    are zero-padded to equal length, so "1.2.7" is greater than "1.2.18".
    """
    try:
        iv1 = v1.replace(" ", "").lower()
        iv2 = v2.replace(" ", "").lower()

        blank = ("", ".")
        if iv1 == iv2 and iv1 in blank:
            return 3
        if iv1 in blank or iv2 in blank:
            return 4

        iv1list = _splitVersion(iv1)
        iv2list = _splitVersion(iv2)
        if iv1list is None or iv2list is None:
            return 4

        # Result used when every shared token compares equal.
        if len(iv1list) > len(iv2list):
            rs = 1
        elif len(iv1list) < len(iv2list):
            rs = 2
        else:
            rs = 3

        for (kind1, value1), (kind2, value2) in zip(iv1list, iv2list):
            if kind1 != kind2:
                return 4

            if kind1 == "match2":
                order = _compareNumericPart(value1, value2)
            else:
                order = (value1 > value2) - (value1 < value2)

            if order > 0:
                return 1
            if order < 0:
                return 2
        return rs
    except Exception as e:
        print("比较大小Exception------")
        print(e)
        return 4





def writeLog(modelList):
    """Write the summary log: one section per result status.

    Status codes:
        0 : not processed
        1 : app not found on Google Play
        2 : requested version lower than the Google Play version
        3 : requested version higher than the Google Play version
        4 : updated successfully
        5 : failed with an exception / error
        6 : versions cannot be compared

    Each section consists of a title, the number of matching models and one
    line per model. Sections for statuses 2, 3, 4 and 6 also list the
    requested and the Google Play version.

    :param modelList: models whose ``status`` attribute is inspected.
    """
    # (section title, status code, whether the version columns are written)
    sections = (
        ("--------谷歌市场找不到的应用-------", 1, False),
        ("--------请求版本,低于谷歌市场版本-------", 2, True),
        ("--------请求版本,高于谷歌市场版本-------", 3, True),
        ("--------更新成功-------", 4, True),
        ("--------异常、错误失败的应用-------", 5, False),
        ("--------版本无法比较的应用-------", 6, True),
    )

    log = Log(Log.allApp)
    try:
        for title, status, withVersions in sections:
            # Blank console line between sections
            print("")
            log.writeLog(title)
            matched = [i for i in modelList if i.status == status]
            log.writeLog("总数: " + str(len(matched)))
            for model in matched:
                line = model.resId + "    " + model.pkgName
                if withVersions:
                    line = line + "    " + model.pkgVersion + "    " + model.google_pkgVersion
                log.writeLog(line)
    finally:
        # Close the file even if writing one of the lines fails.
        log.close()


# Logging
class Log :
    """A log file named ``<name><timestamp>.log`` inside ``logPath``.

    The class attributes below are the base names used for the individual
    log files. Every written line is echoed to the console and flushed to
    the file immediately.
    """
    notfoundApp = "notfoundApp"
    lowApp = "lowApp"
    highApp = "highApp"
    successApp = "successApp"
    errorApp = "errorApp"
    errorVersionApp = "errorVersionApp"
    allApp = "allApp"

    # Console prefix printed by writeLog3 for each known status code.
    _statusLabels = {
        1: "谷歌市场找不到的应用 :",
        2: "请求版本 低于谷歌市场版本:",
        3: "请求版本 高于谷歌市场版本 :",
        4: "更新成功 :",
        5: " 异常、错误失败的app :",
        6: "版本无法比较 :",
    }

    # Text placed between requested version and Google Play version.
    # Status 4 is reached when both versions are equal, so it uses "==".
    _versionSeparators = {
        2: "  <  ",
        3: "  >  ",
        4: "  ==  ",
        6: "  !=  ",
    }

    def __init__(self,name):
        """Create ``logPath`` if needed and open a new log file for ``name``."""
        if not os.path.exists(logPath) :
            os.makedirs(logPath)
        now = time.strftime('%Y%m%d%H%M%S')
        filename = logPath + os.sep + name + now + ".log"
        self.f = open(filename,"wb+")

    def _append(self, text):
        """Write ``text`` plus a line separator to the file and flush."""
        line = text + os.linesep
        # The file is opened in binary mode: encode text explicitly instead
        # of depending on the interpreter's default encoding.
        if not isinstance(line, bytes):
            line = line.encode("utf-8")
        self.f.write(line)
        self.f.flush()

    def writeLog(self,str):
        """Print ``str`` and append it to the log file."""
        print(str)
        self._append(str)

    def writeLog2(self,model):
        """Log one line describing ``model`` according to ``model.status``.

        Models whose status is not one of 1..6 are ignored.
        """
        status = model.status
        if status not in self._statusLabels:
            return

        text = model.resId + "    " + model.pkgName
        if status == 5:
            text = text + "    " + str(model.ex)
        elif status != 1:
            text = (text + "    " + model.pkgVersion
                    + self._versionSeparators[status] + model.google_pkgVersion)
        self.writeLog3(status, text)

    def writeLog3(self,status,str):
        """Print ``str`` with the label for ``status`` and append it to the file.

        Nothing is printed or written for a status outside 1..6.
        """
        label = self._statusLabels.get(status)
        if label is None:
            return
        print(label + str)
        self._append(str)

    def close(self):
        """Close the underlying file."""
        self.f.close()
        
        
        

# Excel row model
class XlsModel :
    """One app entry read from the spreadsheet, together with its result state.

    ``status`` codes:
        0  : not processed
        1  : app not found on Google Play
        2  : requested version lower than the Google Play version
        3  : requested version higher than the Google Play version
        4  : updated successfully
        5  : failed with an exception / error
        6  : versions cannot be compared
        10 : not processed (the value every new instance starts with)
    """

    def __init__(self,resId,resType,resName,pkgName,pkgVersion):
        self.resId, self.resType, self.resName = resId, resType, resName
        self.pkgName, self.pkgVersion = pkgName, pkgVersion
        # Every new model starts in the "not processed" state.
        self.status = 10
        # Starts empty; not assigned anywhere else inside this class.
        self.google_pkgVersion = ""

    def __str__(self):
        labelled = (
            ("resId", self.resId),
            ("resType", self.resType),
            ("resName", self.resName),
            ("pkgName", self.pkgName),
            ("pkgVersion", self.pkgVersion),
        )
        return " ".join([label + " : " + value for label, value in labelled])

    # File name of the icon, without extension
    def getFileName(self):
        return "icon-google"

    # Directory of the icon, relative to the save root:
    # resId%1000/resType-resId
    def getFilePath(self):
        bucket = str(int(self.resId) % 1000)
        leaf = "-".join((self.resType, self.resId))
        return bucket + os.sep + leaf

    # URL of the app's details page
    def getNetworkPath(self):
        return host + self.pkgName


# Run the crawl
def main(maxRounds=None):
    """Fetch the Google Play page of every app in the workbook and log results.

    For each model the details page is downloaded and its version compared
    with the requested one. The model's ``status`` is then set to:

        1 : page not found (getDoc returned None)
        2 : requested version lower than the Google Play version
        3 : requested version higher than the Google Play version
        4 : versions equal; the icon was downloaded
        5 : an exception occurred (stored in ``model.ex``)
        6 : versions cannot be compared

    Each status has its own log file. After every pass over the list, the
    models left in status 5 are processed again. When the loop ends, the
    summary log is written with ``writeLog``.

    :param maxRounds: optional upper bound on the number of passes. The
        default ``None`` keeps retrying until no model is left in status 5.
    """
    modelList = analyXls(xls_filename)
    models = modelList[:]

    # One log per result status.
    logs = {
        1: Log(Log.notfoundApp),
        2: Log(Log.lowApp),
        3: Log(Log.highApp),
        4: Log(Log.successApp),
        5: Log(Log.errorApp),
        6: Log(Log.errorVersionApp),
    }

    # Only versions made of letters, digits, dots and spaces are compared.
    comparable = re.compile('^[A-Za-z0-9. ]+$', re.S)
    # Maps the result of eqSize(google, requested) to a model status.
    statusByOrder = {1: 2, 2: 3, 3: 4, 4: 6}

    inum = 0
    try:
        while models and (maxRounds is None or inum < maxRounds):
            inum = inum + 1
            count = len(models)

            for exeCount, model in enumerate(models, 1):
                point = time.time()
                try:
                    # Three blank console lines before each app
                    print("\n\n")
                    url = model.getNetworkPath()
                    doc = getDoc(url)
                    # getDoc hands back the "timed out" reason on a timeout;
                    # keep trying until the page (or a 404) comes back.
                    while str(doc) == "timed out":
                        doc = getDoc(url)
                        print("vpn链接断开 或其他问题导致连不上谷歌市场...    %s  %s" % (model.resId, model.pkgName))

                    if doc is None:
                        model.status = 1
                        logs[1].writeLog2(model)
                        continue

                    model.google_pkgVersion = getVersion(doc)
                    googleVersion = model.google_pkgVersion

                    if googleVersion == model.pkgVersion:
                        status = 4
                    elif comparable.match(googleVersion) and comparable.match(model.pkgVersion):
                        # Compare the two versions
                        status = statusByOrder.get(eqSize(googleVersion, model.pkgVersion))
                    else:
                        # Version text the comparison cannot handle
                        status = 6

                    if status is not None:
                        if status == 4:
                            # Download first: a failure ends up as status 5.
                            downloadImage(getIconUrl(doc), model)
                        model.status = status
                        logs[status].writeLog2(model)
                except Exception as ex:
                    model.status = 5
                    model.ex = ex
                    logs[5].writeLog2(model)
                print("gameid:%s %s used %s sec" % (model.resName, model.pkgName, time.time() - point))
                print("num %s count: %s  exeCount: %s 剩余:%s " % (inum, count, exeCount, (count - exeCount)))

            # Only the models that failed with an error are tried again.
            models = [i for i in modelList if i.status == 5]
    finally:
        # Close every per-status log, also when the run is interrupted.
        for log in logs.values():
            log.close()

    writeLog(modelList)

 
if __name__ == "__main__":
    # Script entry point: print start and end timestamps around the crawl,
    # followed by the elapsed time in seconds.
    print('>>BEGIN<<')
    start = time.time()
    print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(start)))
    main()
    end = time.time()
    print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(end)))
    print("#########get_icon_for_googleapp_by_id_and_version_0.py over. Used %s" % (end - start))
    print('>>END<<')









Logo

CSDN联合极客时间,共同打造面向开发者的精品内容学习社区,助力成长!

更多推荐