引言

应用 Java 的开源库,编写一个搜索引擎,这个引擎能爬取一个网站的内容,并根据网页内容进行深度爬取,获取所有相关的网页地址和内容。用户可以通过关键词,搜索所有相关的网址。

具体功能

(1) 用户可以指定爬取一个url对应的网页的内容。

(2) 对网页内容进行解析,并获取其中所有的url链接地址。

(3) 用户可以设定爬取深度,代表着从初始url对应的页面开始,可以爬取其中所有的url对应的网页内的url,以此类推。深度越大,能爬取到的网站越多。

(4) 对爬取到的url内容进行保存、建立索引。建立索引的内容是url地址本身,和url对应的网页标题。

(5) 用户可以通过关键词对网址进行搜索,找出有该关键词的url地址。

(6) 建立索引和搜索索引的过程能智能识别中文关键词,能对关键词进行分词操作。

(7) 用户可以指定保存索引的地址、初始url、爬取深度、进行搜索的关键词和最大匹配项。

开源框架

Lucene

Jsoup

源码

爬虫部分:Spider.java

package webCrawler.Spider;

import java.io.IOException;

import java.util.ArrayList;

import java.util.HashSet;

import java.util.Scanner;

import org.jsoup.Jsoup;

import org.jsoup.nodes.Document;

import org.jsoup.nodes.Element;

import org.jsoup.select.Elements;

import webCrawler.Index.BuildIndex;

/**

* @author lannooo

*/

public class Spider {

    /** All URLs collected so far; deduplicated at the end of getAll(). */
    ArrayList<String> URLs;

    /** The URL the crawl starts from. */
    private String startURL;

    /** How many levels of links to follow from startURL. */
    private int digLevel;

    /** Only links containing this marker are followed and stored. */
    private static final String DOMAIN_FILTER = "zju.edu.cn";

    /**
     * @param startURL the crawler's starting URL
     * @param digLevel the crawl depth
     */
    public Spider(String startURL, int digLevel){
        this.startURL = startURL;
        this.digLevel = digLevel;
        this.URLs = new ArrayList<>();
    }

    /**
     * Fetches every page in {@code arrayList} and collects the links found on them.
     *
     * @param level remaining crawl depth; nothing is fetched when level &lt;= 0
     * @param arrayList the URLs whose pages are parsed in this round
     * @return the deduplicated set of links discovered (empty when level &lt;= 0,
     *         instead of the null the original returned)
     * @throws IOException declared for API compatibility; per-page fetch errors
     *         are handled inside getBareLinks and simply yield no links
     */
    public ArrayList<String> getLevelURLs(int level, ArrayList<String> arrayList)
            throws IOException{
        ArrayList<String> total = new ArrayList<>();
        if(level>0){
            for(String url: arrayList){
                /* parse each page and gather every URL it contains */
                total.addAll(getBareLinks(url));
            }
            /* a HashSet round-trip removes duplicate entries */
            total = new ArrayList<>(new HashSet<>(total));
        }
        return total;
    }

    /**
     * Crawls all related URLs starting from startURL, breadth-first by level.
     *
     * @throws IOException propagated from getLevelURLs
     */
    public void getAll() throws IOException{
        ArrayList<String> newURLs;
        ArrayList<String> currentURLs = new ArrayList<>();
        /* seed the frontier with startURL */
        currentURLs.add(startURL);
        for(int i=digLevel; i>0; i--){
            /*
             * For each level: expand the current frontier into the set of URLs
             * it links to, archive the frontier into the global URL set, then
             * make the newly found URLs the frontier for the next iteration.
             */
            System.out.println("Dig into level: " + (digLevel-i+1));
            newURLs = getLevelURLs(i, currentURLs);
            URLs.addAll(currentURLs);
            currentURLs = newURLs;
        }
        /* the last frontier was never archived inside the loop */
        URLs.addAll(currentURLs);
        URLs = new ArrayList<>(new HashSet<>(URLs));
    }

    /**
     * Fetches the title of every collected URL and writes url/title pairs
     * into a Lucene index.
     *
     * @param path directory where the index is stored
     * @throws IOException propagated from the index writer
     */
    public void storeURLsAndInfo(String path) throws IOException{
        BuildIndex build = new BuildIndex(path);
        for(String each: URLs){
            String text = getLinkText(each);
            /* pages that timed out or failed yield null and are skipped */
            if(text!=null){
                build.addField("url", each);
                build.addField("text", text);
                /* commit this entry as one document in the index */
                build.pushIndex();
            }
        }
        build.close();
    }

    /**
     * @param url the page whose title is wanted
     * @return the page title, or null if the page could not be fetched
     * @throws IOException declared for API compatibility
     */
    public String getLinkText(String url) throws IOException{
        Document document = null;
        try {
            /* connect with Jsoup, 3 second timeout */
            document = Jsoup.connect(url).timeout(3000).get();
        } catch (Exception e) {
            System.out.println("[TIMEOUT]Get title of url:"+url);
            return null;
        }
        return document.title();
    }

    /**
     * @param url the page to parse
     * @return deduplicated list of in-domain links found on the page
     *         (empty if the page could not be fetched)
     * @throws IOException declared for API compatibility
     */
    public ArrayList<String> getBareLinks(String url) throws IOException{
        ArrayList<String> linksList = new ArrayList<>();
        Document document;
        try {
            document = Jsoup.connect(url).timeout(2000).get();
        } catch (Exception e) {
            /* unreachable or slow page: contribute no links */
            return linksList;
        }
        /* every element with an href attribute inside the body tag */
        Elements links = document.select("body").select("a[href]");
        for(Element link: links){
            /* absolute URL with any '#' fragment markers stripped */
            String href = link.attr("abs:href").replaceAll("#", "");
            /* keep only in-domain URLs, without a trailing '/' */
            if(href.contains(DOMAIN_FILTER)){
                if (href.endsWith("/")){
                    href = href.substring(0, href.length()-1);
                }
                linksList.add(href);
            }
        }
        return new ArrayList<>(new HashSet<>(linksList));
    }

    public static void main(String[] args) {
        Scanner in = new Scanner(System.in);
        System.out.println("Enter url:");
        String url = in.nextLine().trim();
        while(!url.startsWith("http://")){
            System.out.println("http:// is needed!");
            System.out.println("Enter url:");
            url = in.nextLine().trim();
        }
        System.out.println("Enter depth to dig more urls[<=3 recommended]:");
        int depth = in.nextInt();
        /* consume the newline left behind by nextInt(); without this the
         * following nextLine() always returned "" and the user could never
         * choose a save path */
        in.nextLine();
        Spider spider = new Spider(url, depth);
        System.out.println("Enter path you want to save[default=d:/index-spider]:");
        String path = in.nextLine().trim();
        if(path.length()==0){
            path = "d:/index-spider";
        }
        try {
            System.out.println("Start fetching...");
            spider.getAll();
            System.out.println("Urls got success!");
            spider.storeURLsAndInfo(path);
            System.out.println("Stored success!");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

}

建立索引:BuildIndex.java

package webCrawler.Index;

import java.io.*;

import org.apache.lucene.analysis.Analyzer;

import org.apache.lucene.document.Document;

import org.apache.lucene.document.Field;

import org.apache.lucene.document.TextField;

import org.apache.lucene.index.IndexWriter;

import org.apache.lucene.index.IndexWriterConfig;

import org.apache.lucene.store.Directory;

import org.apache.lucene.store.FSDirectory;

import org.apache.lucene.util.Version;

import org.wltea.analyzer.lucene.IKAnalyzer;

/**

* @author lannooo

*

*/

public class BuildIndex {

    private File file;

    private Directory directory;

    private IndexWriter indexWriter;

    private IndexWriterConfig config;

    private Analyzer analyzer;

    /** The document currently being assembled; replaced after each pushIndex(). */
    private Document document;

    /**
     * Opens (or creates) a Lucene index in the given directory.
     *
     * @param path directory where the index is stored
     * @throws IllegalStateException if the index cannot be opened; the original
     *         code swallowed the error and left indexWriter null, which later
     *         surfaced as an opaque NullPointerException in pushIndex()
     */
    public BuildIndex(String path) {
        try {
            file = new File(path);
            directory = FSDirectory.open(file);
            document = new Document();
            analyzer = new IKAnalyzer(); /* Chinese word-segmentation analyzer */
            config = new IndexWriterConfig(Version.LUCENE_4_10_0, analyzer);
            indexWriter = new IndexWriter(directory, config);
        } catch (Exception e) {
            /* fail fast instead of continuing with a null writer */
            throw new IllegalStateException("Cannot open index at " + path, e);
        }
    }

    /**
     * Adds a stored, tokenized field to the document under construction.
     *
     * @param fieldName name of the new field
     * @param fieldText content of the new field
     */
    public void addField(String fieldName, String fieldText){
        try{
            Field field = new TextField(fieldName, fieldText, Field.Store.YES);
            document.add(field);
        }catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Writes the current document to the index and starts a fresh one.
     */
    public void pushIndex(){
        try {
            indexWriter.addDocument(document);
            document = new Document();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Convenience method: adds a complete url/text document to the index.
     *
     * @param url the URL to index
     * @param text the text associated with the URL (e.g. the page title)
     */
    public void addOneIndex(String url, String text){
        this.addField("url", url);
        this.addField("text", text);
        this.pushIndex();
    }

    /**
     * Closes the index writer, committing pending documents.
     */
    public void close(){
        try {
            indexWriter.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

}

搜索索引:SearchIndex.java

package webCrawler.Index;

import java.io.File;

import java.util.Scanner;

import org.apache.lucene.analysis.Analyzer;

import org.apache.lucene.document.Document;

import org.apache.lucene.index.DirectoryReader;

import org.apache.lucene.queryparser.classic.QueryParser;

import org.apache.lucene.search.IndexSearcher;

import org.apache.lucene.search.Query;

import org.apache.lucene.search.ScoreDoc;

import org.apache.lucene.search.TopDocs;

import org.apache.lucene.store.FSDirectory;

import org.wltea.analyzer.lucene.IKAnalyzer;

/**

* @author lannooo

*

*/

public class SearchIndex {

    private IndexSearcher indexSearcher;

    private Analyzer analyzer;

    private QueryParser parser;

    private Query query;

    /** Result of the most recent search(); consumed by printHits(). */
    private TopDocs hits;

    private DirectoryReader reader;

    /**
     * Opens the index at the given path for searching.
     *
     * @param path directory containing the index
     */
    public SearchIndex(String path){
        try {
            reader = DirectoryReader.open(FSDirectory.open(new File(path)));
            indexSearcher = new IndexSearcher(reader);
            analyzer = new IKAnalyzer();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Runs a query against one field of the index.
     *
     * @param fieldName the field to search
     * @param text the query text (segmented by the IK analyzer)
     * @param matchNumber maximum number of matches to retrieve
     * @return total number of hits, or -1 on failure
     */
    public int search(String fieldName, String text, int matchNumber){
        try {
            parser = new QueryParser(fieldName, analyzer);
            query = parser.parse(text);
            hits = indexSearcher.search(query, matchNumber);
            return hits.totalHits;
        } catch (Exception e) {
            e.printStackTrace();
        }
        return -1;
    }

    /**
     * Prints every hit of the last search, then releases the reader.
     * The reader is now closed even when printing fails (the original
     * closed it inside the try block, leaking it on any print error).
     */
    public void printHits(){
        try{
            System.out.println("Total hits number:"+hits.totalHits);
            for(ScoreDoc doc: hits.scoreDocs){
                Document document = indexSearcher.doc(doc.doc);
                System.out.println(document.get("url"));
                System.out.println(document.get("text"));
            }
        }catch (Exception e) {
            e.printStackTrace();
        }
        close();
    }

    /**
     * Releases the underlying index reader. Safe to call more than once;
     * without this, a failed search() leaked the reader because it was
     * only ever closed inside printHits().
     */
    public void close(){
        try {
            if (reader != null) {
                reader.close();
                reader = null;
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    public static void main(String[] args) {
        /* read search parameters from the console */
        Scanner in = new Scanner(System.in);
        System.out.println("Enter path of the index:");
        String path = in.nextLine().trim();
        while(path.length()==0){
            System.out.println("Enter path of the index:");
            path = in.nextLine().trim();
        }
        System.out.println("Enter max hit number:");
        int max = in.nextInt();
        while(max<0){
            System.out.println("Enter max hit number:");
            max = in.nextInt();
        }
        /* consume the newline left behind by nextInt() */
        in.nextLine();
        System.out.print("Search>>> ");
        String text = in.nextLine().trim();
        /* loop until the user enters "q"; empty input simply re-prompts */
        while(!text.equals("q")){
            if(text.length()>0){
                SearchIndex search = new SearchIndex(path);
                int hits = search.search("text", text, max);
                if(hits!=-1){
                    search.printHits();
                }
                /* release the reader even when the search failed */
                search.close();
            }
            System.out.print("Search>>> ");
            text = in.nextLine().trim();
        }
    }

}

UI界面(这里为了方便只是命令行的形式,可以根据需求写一个GUI界面)

package webCrawler.UI;

import java.util.Scanner;

import webCrawler.Index.SearchIndex;

/**

* @author lannooo

*

*/

public class UI {

    /** Default location of the index built by the crawler. */
    private static final String DEFAULT_INDEX_PATH = "d:/index-spider2";

    /** Maximum number of hits shown per query. */
    private static final int MAX_HITS = 20;

    public static void main(String[] args) {
        /* optional: args[0] overrides the hard-coded index path */
        String path = DEFAULT_INDEX_PATH;
        if (args.length > 0 && args[0].trim().length() > 0) {
            path = args[0].trim();
        }
        /* read keywords from the console */
        Scanner in = new Scanner(System.in);
        System.out.print("Search>>> ");
        String text = in.nextLine().trim();
        /* exit when the user enters "q" or an empty line */
        while(!text.equals("q") && text.length()>0){
            SearchIndex search = new SearchIndex(path);
            int hits = search.search("text", text, MAX_HITS);
            if(hits!=-1){
                search.printHits();
            }
            System.out.print("Search>>> ");
            text = in.nextLine().trim();
        }
    }

}

以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持脚本之家。

Logo

瓜分20万奖金 获得内推名额 丰厚实物奖励 易参与易上手

更多推荐