<dependency>
    <groupId>org.apache.tika</groupId>
    <artifactId>tika-core</artifactId>
    <version>1.12</version>
</dependency>
<dependency>
    <groupId>org.apache.tika</groupId>
    <artifactId>tika-parsers</artifactId>
    <version>1.12</version>
</dependency>
<dependency>
    <groupId>com.bbossgroups.plugins</groupId>
    <artifactId>bboss-elasticsearch-rest-jdbc</artifactId>
    <version>5.5.7</version>
</dependency>
<dependency>
    <groupId>com.hankcs</groupId>
    <artifactId>hanlp</artifactId>
    <version>portable-1.7.1</version>
</dependency>
<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpclient</artifactId>
    <version>4.5.5</version>
</dependency>
Note: when combining these with Spring, pay attention to the version numbers; versions that are too new cause jar conflicts. tika-parsers already depends on poi.jar, so do not add poi.jar to the project separately or it will conflict.
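With tika-core and tika-parsers on the classpath, the document body can be extracted to plain text before it is indexed. A minimal sketch (the sample file path is hypothetical):

import org.apache.tika.Tika;

import java.io.File;

public class TikaTextExtractor {
    public static void main(String[] args) throws Exception {
        // the Tika facade auto-detects the file type (Word, PDF, HTML, ...) via tika-parsers
        Tika tika = new Tika();
        // hypothetical sample file; in the project this would be the uploaded document
        String contentBody = tika.parseToString(new File("h:/doc/sample.docx"));
        System.out.println(contentBody);
    }
}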
Complete project: elasticsearch-common
pom.xml contents:
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.hd</groupId>
    <artifactId>elasticsearch-common</artifactId>
    <version>1.0-SNAPSHOT</version>
    <packaging>war</packaging>
    <name>elasticsearch-common Maven Webapp</name>
    <url>http://www.example.com</url>
    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <maven.compiler.source>1.7</maven.compiler.source>
        <maven.compiler.target>1.7</maven.compiler.target>
        <mysql.version>5.1.40</mysql.version>
        <druid.version>1.0.29</druid.version>
        <spring.version>4.2.3.RELEASE</spring.version>
        <servlet.version>3.0.1</servlet.version>
        <jackson.version>2.8.8</jackson.version>
        <commons-io.version>2.5</commons-io.version>
        <log4j2.version>2.8.2</log4j2.version>
        <hibernate.version>5.3.5.Final</hibernate.version>
        <hibernate-validator.version>4.3.11.Final</hibernate-validator.version>
        <shiro.version>1.3.2</shiro.version>
        <ehcache.version>2.6.11</ehcache.version>
    </properties>
    <dependencies>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.11</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>javax.el</groupId>
            <artifactId>javax.el-api</artifactId>
            <version>3.0.0</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.glassfish</groupId>
            <artifactId>javax.el</artifactId>
            <version>3.0.0</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>javax.servlet</groupId>
            <artifactId>javax.servlet-api</artifactId>
            <version>${servlet.version}</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>javax.servlet</groupId>
            <artifactId>jsp-api</artifactId>
            <version>2.0</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>javax.servlet</groupId>
            <artifactId>jstl</artifactId>
            <version>1.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.logging.log4j</groupId>
            <artifactId>log4j-core</artifactId>
            <version>${log4j2.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.logging.log4j</groupId>
            <artifactId>log4j-jcl</artifactId>
            <version>${log4j2.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.logging.log4j</groupId>
            <artifactId>log4j-slf4j-impl</artifactId>
            <version>${log4j2.version}</version>
        </dependency>
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-core</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-context</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-beans</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-expression</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-jdbc</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-orm</artifactId>
            <version>${spring.version}</version>
        </dependency>
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-tx</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-aop</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-web</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-webmvc</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-test</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-aspects</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-context-support</artifactId>
        </dependency>
        <dependency>
            <groupId>commons-io</groupId>
            <artifactId>commons-io</artifactId>
            <version>${commons-io.version}</version>
        </dependency>
        <dependency>
            <groupId>commons-fileupload</groupId>
            <artifactId>commons-fileupload</artifactId>
            <version>1.3.1</version>
        </dependency>
        <dependency>
            <groupId>org.hibernate</groupId>
            <artifactId>hibernate-core</artifactId>
            <version>${hibernate.version}</version>
        </dependency>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>${mysql.version}</version>
        </dependency>
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>druid</artifactId>
            <version>${druid.version}</version>
        </dependency>
        <dependency>
            <groupId>com.fasterxml.jackson.core</groupId>
            <artifactId>jackson-databind</artifactId>
            <version>${jackson.version}</version>
        </dependency>
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.54</version>
        </dependency>
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.5</version>
        </dependency>
        <dependency>
            <groupId>org.apache.tika</groupId>
            <artifactId>tika-core</artifactId>
            <version>1.12</version>
        </dependency>
        <dependency>
            <groupId>org.apache.tika</groupId>
            <artifactId>tika-parsers</artifactId>
            <version>1.12</version>
        </dependency>
        <dependency>
            <groupId>com.bbossgroups.plugins</groupId>
            <artifactId>bboss-elasticsearch-rest-jdbc</artifactId>
            <version>5.5.7</version>
        </dependency>
        <dependency>
            <groupId>com.hankcs</groupId>
            <artifactId>hanlp</artifactId>
            <version>portable-1.7.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.shiro</groupId>
            <artifactId>shiro-spring</artifactId>
            <version>${shiro.version}</version>
            <exclusions>
                <exclusion>
                    <artifactId>slf4j-api</artifactId>
                    <groupId>org.slf4j</groupId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>org.hibernate</groupId>
            <artifactId>hibernate-validator</artifactId>
            <version>${hibernate-validator.version}</version>
        </dependency>
        <dependency>
            <groupId>net.sf.ehcache</groupId>
            <artifactId>ehcache-core</artifactId>
            <version>${ehcache.version}</version>
        </dependency>
        <dependency>
            <groupId>com.googlecode.ehcache-spring-annotations</groupId>
            <artifactId>ehcache-spring-annotations</artifactId>
            <version>1.2.0</version>
        </dependency>
    </dependencies>
    <build>
        <finalName>elasticsearch-common</finalName>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.5.1</version>
                <configuration>
                    <source>${maven.compiler.source}</source>
                    <target>${maven.compiler.target}</target>
                    <encoding>${project.build.sourceEncoding}</encoding>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-surefire-plugin</artifactId>
                <version>2.4.2</version>
                <configuration>
                    <skipTests>true</skipTests>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.eclipse.jetty</groupId>
                <artifactId>jetty-maven-plugin</artifactId>
                <version>9.3.10.v20160621</version>
                <configuration>
                    <stopPort>9967</stopPort>
                    <stopKey>stop</stopKey>
                    <scanIntervalSeconds>0</scanIntervalSeconds>
                    <httpConnector>
                        <port>8878</port>
                    </httpConnector>
                    <webApp>
                        <contextPath>/</contextPath>
                    </webApp>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.tomcat.maven</groupId>
                <artifactId>tomcat7-maven-plugin</artifactId>
                <version>2.2</version>
                <configuration>
                    <port>8878</port>
                    <path>/</path>
                    <uriEncoding>UTF-8</uriEncoding>
                    <server>tomcat7</server>
                </configuration>
            </plugin>
        </plugins>
    </build>
    <repositories>
        <repository>
            <id>aliyun</id>
            <name>aliyun</name>
            <url>http://maven.aliyun.com/nexus/content/groups/public</url>
        </repository>
    </repositories>

    <dependencyManagement>
        <dependencies>
            <dependency>
                <groupId>org.springframework</groupId>
                <artifactId>spring-framework-bom</artifactId>
                <version>${spring.version}</version>
                <type>pom</type>
                <scope>import</scope>
            </dependency>
        </dependencies>
    </dependencyManagement>
</project>
elasticsearch.properties contents:
#elasticUser=elastic
#elasticPassword=hzhh123
elasticsearch.rest.hostNames=127.0.0.1:9200
#elasticsearch.rest.hostNames=192.168.200.82:9200,192.168.200.83:9200,192.168.200.85:9200
elasticsearch.dateFormat=yyyy.MM.dd
elasticsearch.timeZone=Asia/Shanghai
elasticsearch.ttl=2d
# showTemplate debug switch: prints the rendered DSL script to the console; false = off, true = on (log4j level must be at least INFO)
elasticsearch.showTemplate=true
#elasticsearch.discoverHost=true
http.timeoutConnection = 400000
http.timeoutSocket = 400000
http.connectionRequestTimeout=400000
http.retryTime = 1
http.maxLineLength = -1
http.maxHeaderCount = 200
http.maxTotal = 400
http.defaultMaxPerRoute = 200
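A minimal smoke test of this configuration, assuming elasticsearch.properties is on the classpath: ElasticSearchHelper builds a thread-safe singleton REST client from it, and existIndice checks for the "search" index used later in this article.

import org.frameworkset.elasticsearch.ElasticSearchHelper;
import org.frameworkset.elasticsearch.client.ClientInterface;

public class EsSmokeTest {
    public static void main(String[] args) {
        // the default REST client is configured from elasticsearch.properties (hosts, timeouts, pool sizes)
        ClientInterface clientUtil = ElasticSearchHelper.getRestClientUtil();
        System.out.println("index 'search' exists: " + clientUtil.existIndice("search"));
    }
}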
elasticsearch.xml
    class="org.frameworkset.elasticsearch.ElasticSearch"
    init-method="configure"
    destroy-method="stop"
    f:elasticsearchPropes="attr:elasticsearchPropes"/>
httpclient.xml
    f:timeoutConnection = "${http.timeoutConnection}"
    f:timeoutSocket = "${http.timeoutSocket}"
    f:connectionRequestTimeout="${http.connectionRequestTimeout}"
    f:retryTime = "${http.retryTime}"
    f:maxLineLength = "${http.maxLineLength}"
    f:maxHeaderCount = "${http.maxHeaderCount}"
    f:maxTotal = "${http.maxTotal}"
    f:defaultMaxPerRoute = "${http.defaultMaxPerRoute}"
    class="org.frameworkset.spi.remote.http.ClientConfiguration">
search.xml
"settings": {
"number_of_shards": 6,
"index.refresh_interval": "5s"
},
"mappings": {
"document": {
"properties": {
"title": {
"type": "text",
"analyzer": "ik_max_word"
},
"contentbody": {
"type": "text",
"analyzer": "ik_max_word"
},
"fileid": {
"type": "text"
},
"description": {
"type": "text",
"analyzer": "ik_max_word"
},
"tags": {
"type": "text"
},
"typeid": {
"type": "text"
},
"classicid": {
"type": "text"
},
"url": {
"type": "text"
},
"agentstarttime": {
"type": "date"
## ,"format":"yyyy-MM-dd HH:mm:ss.SSS||yyyy-MM-dd'T'HH:mm:ss.SSS||yyyy-MM-dd HH:mm:ss||epoch_millis"
},
"name": {
"type": "keyword"
}
}
}
}
}]]>
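A minimal sketch of creating the index from the mapping above with the ElasticSearchClentUtil shown later in this article. "search" is the index name, and "createSearchIndice" is a hypothetical template name, since the <property name="..."> wrappers around the DSL were lost in this listing.

import com.hd.util.ElasticSearchClentUtil;

public class CreateIndexDemo {
    public static void main(String[] args) throws Exception {
        // esmapper/search.xml is the classpath location of the DSL template file above
        ElasticSearchClentUtil esUtil = new ElasticSearchClentUtil("esmapper/search.xml");
        // drops the index if it already exists, then creates it with the mapping template
        System.out.println(esUtil.createIndex("search", "createSearchIndice"));
    }
}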
"query": {
"bool": {
"filter": [
{ ## terms query: find the documents matching multiple application names
"terms": {
"applicationname.keyword": [#[applicationname1],#[applicationname2]]
}
},
{ ## range query: return records within the given time range; accepts long (epoch millisecond) values
"range": {
"agentstarttime": {
"gte": #[starttime], ## statistics start time
"lt": #[endtime] ## statistics end time
}
}
}
]
}
},
## return at most 1000 records
"size":1000
}]]>
"query": {
"bool": {
"filter": [
{
"term": {
"classicid": #[classicid]
}
}],
"must": [
{
"multi_match": {
"query": #[keywords],
"fields": ["contentbody","title","description"]
}
}
]
}
},
## paging offset
"from":#[from] ,
## return at most "size" records
"size":#[size],
"highlight": {
"pre_tags": [
""
],
"post_tags": [
""
],
"fields": {
"*": {}
},
"fragment_size": 2147483647
}
}]]>
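A minimal sketch of running the classicid + keywords query above directly through the bboss searchList API. "searchByClassicId" is a hypothetical template name, and Doc is a stand-in result bean; the real project would map every field of the document mapping.

import org.frameworkset.elasticsearch.ElasticSearchHelper;
import org.frameworkset.elasticsearch.client.ClientInterface;
import org.frameworkset.elasticsearch.entity.ESBaseData;
import org.frameworkset.elasticsearch.entity.ESDatas;

import java.util.HashMap;
import java.util.Map;

public class KeywordSearchDemo {
    // minimal stand-in bean; extends ESBaseData so highlight fragments can be read
    public static class Doc extends ESBaseData {
        private String title;
        private String contentbody;
        public String getTitle() { return title; }
        public void setTitle(String title) { this.title = title; }
        public String getContentbody() { return contentbody; }
        public void setContentbody(String contentbody) { this.contentbody = contentbody; }
    }

    public static void main(String[] args) {
        ClientInterface clientUtil = ElasticSearchHelper.getConfigRestClientUtil("esmapper/search.xml");
        Map<String, Object> params = new HashMap<>();
        params.put("classicid", "1");   // bound to #[classicid]
        params.put("keywords", "算法");  // bound to #[keywords]
        params.put("from", 0);
        params.put("size", 10);
        // "search/_search" = index name plus the _search action
        ESDatas<Doc> esDatas = clientUtil.searchList("search/_search", "searchByClassicId", params, Doc.class);
        System.out.println("total hits: " + esDatas.getTotalSize());
    }
}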
"query": {
"bool": {
"filter": [
{
"term": {
"classicid": #[classicid]
}
}]
}
},
## paging offset
"from":#[from] ,
## return at most "size" records
"size":#[size],
"highlight": {
"pre_tags": [
""
],
"post_tags": [
""
],
"fields": {
"*": {}
},
"fragment_size": 2147483647
}
}]]>
"query": {
"bool": {
"filter": [
{
"term": {
"typeid": #[typeid]
}
}],
"must": [
{
"multi_match": {
"query": #[keywords],
"fields": ["contentbody","title","description"]
}
}
]
}
},
## paging offset
"from":#[from] ,
## return at most "size" records
"size":#[size],
"highlight": {
"pre_tags": [
""
],
"post_tags": [
""
],
"fields": {
"*": {}
},
"fragment_size": 2147483647
}
}]]>
"query": {
"bool": {
"filter": [
{
"term": {
"typeid": #[typeid]
}
}]
}
},
## paging offset
"from":#[from] ,
## return at most "size" records
"size":#[size],
"highlight": {
"pre_tags": [
""
],
"post_tags": [
""
],
"fields": {
"*": {}
},
"fragment_size": 2147483647
}
}]]>
"query": {
"bool": {
"filter": [
{ ## terms query: find the documents matching multiple application names
"terms": {
"applicationname.keyword":[
#if($applicationnames && $applicationnames.size() > 0)
#foreach($applicationname in $applicationnames)
#if($velocityCount > 0),#end "$applicationname"
#end
#end
]
}
},
{ ## range query: return records within the given time range; accepts long (epoch millisecond) values
"range": {
"agentstarttime": {
"gte": #[starttime], ## statistics start time
"lt": #[endtime] ## statistics end time
}
}
}
]
}
},
## return at most 1000 records
"size":1000
}]]>
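The #foreach block above is driven by a java.util.List bound to the $applicationnames variable, while #[starttime] and #[endtime] take epoch-millisecond longs. A minimal sketch, assuming the template is named "searchByApplicationNames" and using Map as a stand-in result type:

import org.frameworkset.elasticsearch.ElasticSearchHelper;
import org.frameworkset.elasticsearch.client.ClientInterface;
import org.frameworkset.elasticsearch.entity.ESDatas;

import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;

public class TermsQueryDemo {
    public static void main(String[] args) {
        ClientInterface clientUtil = ElasticSearchHelper.getConfigRestClientUtil("esmapper/search.xml");
        Map<String, Object> params = new HashMap<>();
        // the list drives the #foreach loop that renders the terms array
        params.put("applicationnames", Arrays.asList("app1", "app2"));
        params.put("starttime", 1553443200000L); // bound to #[starttime]
        params.put("endtime", 1554048000000L);   // bound to #[endtime]
        ESDatas<Map> esDatas = clientUtil.searchList("search/_search", "searchByApplicationNames", params, Map.class);
        System.out.println("total hits: " + esDatas.getTotalSize());
    }
}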
hanlp.properties
# Root directory for all relative paths in this file: root + relative path = full path (relative paths are supported, see https://github.com/hankcs/hanlp/pull/254)
# Windows users: always use / as the path separator
root=h:/doc/java/hzhh123
#root=/home/data/software/devsoft/java/hanlp
# The line above is the only part that must be changed; uncomment and edit the entries below as needed.
# Core dictionary path
CoreDictionaryPath=data/dictionary/CoreNatureDictionary.txt
# Bigram dictionary path
BiGramDictionaryPath=data/dictionary/CoreNatureDictionary.ngram.txt
# Custom dictionary paths, separated by ;. A leading space means the file is in the same directory; "filename POS" means that dictionary's default part of speech. Priority decreases from left to right.
# All dictionaries use UTF-8 encoding; each line is one word in the form [word] [POS A] [freq of A] [POS B] [freq of B] ... Omitting the POS means the dictionary's default POS is used.
CustomDictionaryPath=data/dictionary/custom/CustomDictionary.txt; 现代汉语补充词库.txt; 全国地名大全.txt ns; 人名词典.txt; 机构名词典.txt; 上海地名.txt ns;data/dictionary/person/nrf.txt nrf;
# Stop-word dictionary path
CoreStopWordDictionaryPath=data/dictionary/stopwords.txt
# Synonym dictionary path
CoreSynonymDictionaryDictionaryPath=data/dictionary/synonym/CoreSynonym.txt
# Person-name dictionary path
PersonDictionaryPath=data/dictionary/person/nr.txt
# Person-name transition matrix path
PersonDictionaryTrPath=data/dictionary/person/nr.tr.txt
# Traditional/Simplified Chinese dictionary root
tcDictionaryRoot=data/dictionary/tc
# HMM segmentation model
HMMSegmentModelPath=data/model/segment/HMMSegmentModel.bin
# Whether segmentation results show the part of speech
ShowTermNature=true
# IO adapter: implement com.hankcs.hanlp.corpus.io.IIOAdapter to run HanLP on other platforms (Hadoop, Redis, ...)
# The default IO adapter below is based on the ordinary file system.
#IOAdapter=com.hankcs.hanlp.corpus.io.FileIOAdapter
# Perceptron lexical analyzer
PerceptronCWSModelPath=data/model/perceptron/pku199801/cws.bin
PerceptronPOSModelPath=data/model/perceptron/pku199801/pos.bin
PerceptronNERModelPath=data/model/perceptron/pku199801/ner.bin
# CRF lexical analyzer
CRFCWSModelPath=data/model/crf/pku199801/cws.txt
CRFPOSModelPath=data/model/crf/pku199801/pos.txt
CRFNERModelPath=data/model/crf/pku199801/ner.txt
# For more options, see https://github.com/hankcs/hanlp/blob/master/src/main/java/com/hankcs/hanlp/HanLP.java#L59 and add them here as needed
Note: see https://github.com/hankcs/hanlp, download the data.zip file and unzip it into h:/doc/java/hzhh123.
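After unpacking data.zip and pointing root at it, a quick sanity check that HanLP can find its dictionaries (a minimal sketch; any short Chinese text works):

import com.hankcs.hanlp.HanLP;

public class HanLPSmokeTest {
    public static void main(String[] args) {
        // standard segmentation; fails fast if the dictionaries under root cannot be loaded
        System.out.println(HanLP.segment("商品和服务"));
        // pinyin conversion is another quick check that the data files are readable
        System.out.println(HanLP.convertToPinyinList("中文分词"));
    }
}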
HanLPUtil.java
package com.hd.util;

import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.corpus.document.sentence.Sentence;
import com.hankcs.hanlp.corpus.document.sentence.word.IWord;
import com.hankcs.hanlp.model.crf.CRFLexicalAnalyzer;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * hzhh123
 * 2019/3/25 14:05
 *
 * @description natural language processing: Chinese word segmentation, part-of-speech tagging,
 *              named entity recognition, dependency parsing, new word discovery, key phrase
 *              extraction, automatic summarization, text classification/clustering, pinyin and
 *              traditional/simplified conversion
 * @link https://github.com/hankcs/hanlp
 */
public class HanLPUtil {
    /**
     * @param content
     * @return
     * @description extract a summary (top 3 sentences)
     */
    public static List<String> summary(String content) {
        List<String> summary = HanLP.extractSummary(content, 3);
        return summary;
    }

    /**
     * @param content
     * @return
     * @description extract key phrases (top 5)
     */
    public static List<String> phrase(String content) {
        return HanLP.extractPhrase(content, 5);
    }

    /**
     * @param document
     * @return
     * @throws IOException
     * @description find the words with the relevant parts of speech and collect them into one list
     */
    public static List<String> findWordsAndCollectByLabel(List<String> document) throws IOException {
        /* analyze parts of speech and pick out the useful ones */
        CRFLexicalAnalyzer analyzer = new CRFLexicalAnalyzer();
        Sentence analyzeWords = analyzer.analyze(String.valueOf(document));
        List<IWord> wordsByLabel1 = analyzeWords.findWordsByLabel("n");
        List<IWord> wordsByLabel2 = analyzeWords.findWordsByLabel("ns");
        List<IWord> wordsByLabel3 = analyzeWords.findWordsByLabel("t");
        List<IWord> wordsByLabel4 = analyzeWords.findWordsByLabel("j");
        List<IWord> wordsByLabel5 = analyzeWords.findWordsByLabel("vn");
        List<IWord> wordsByLabel6 = analyzeWords.findWordsByLabel("nr");
        List<IWord> wordsByLabel7 = analyzeWords.findWordsByLabel("nt");
        List<IWord> wordsByLabel8 = analyzeWords.findWordsByLabel("nz");
        wordsByLabel1.addAll(wordsByLabel2);
        wordsByLabel1.addAll(wordsByLabel3);
        wordsByLabel1.addAll(wordsByLabel4);
        wordsByLabel1.addAll(wordsByLabel5);
        wordsByLabel1.addAll(wordsByLabel6);
        wordsByLabel1.addAll(wordsByLabel7);
        wordsByLabel1.addAll(wordsByLabel8);
        List<String> words = new ArrayList<>();
        for (IWord word : wordsByLabel1) {
            words.add(word.getValue());
        }
        return words;
    }

    public static void main(String[] args) {
        String document = "算法可大致分为基本算法、数据结构的算法、数论算法、计算几何的算法、图的算法、动态规划以及数值分析、加密算法、排序算法、检索算法、随机化算法、并行算法、厄米变形模型、随机森林算法。\n"
                + "算法可以宽泛的分为三类,\n"
                + "一,有限的确定性算法,这类算法在有限的一段时间内终止。他们可能要花很长时间来执行指定的任务,但仍将在一定的时间内终止。这类算法得出的结果常取决于输入值。\n"
                + "二,有限的非确定算法,这类算法在有限的时间内终止。然而,对于一个(或一些)给定的数值,算法的结果并不是唯一的或确定的。\n"
                + "三,无限的算法,是那些由于没有定义终止定义条件,或定义的条件无法由输入的数据满足而不终止运行的算法。通常,无限算法的产生是由于未能确定的定义终止条件。";
        List<String> sentenceList = phrase(document);
        // List<String> sentenceList = summary(document);
        System.out.println(sentenceList);
    }
}
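A small usage sketch: the key phrases and noun-like words extracted by HanLPUtil could be used to fill the tags field of the document mapping shown earlier (the sample text is arbitrary):

import com.hd.util.HanLPUtil;

import java.util.Arrays;
import java.util.List;

public class TagExtractionDemo {
    public static void main(String[] args) throws Exception {
        String text = "Elasticsearch是一个基于Lucene的分布式全文检索引擎。";
        // key phrases, e.g. candidates for the "tags" field
        List<String> phrases = HanLPUtil.phrase(text);
        // noun-like words aggregated by part-of-speech label (requires the CRF model files)
        List<String> nouns = HanLPUtil.findWordsAndCollectByLabel(Arrays.asList(text));
        System.out.println(phrases);
        System.out.println(nouns);
    }
}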
ElasticSearchResponseEntity.java
package com.hd.util;

import java.util.List;

/**
 * hzhh123
 * 2019/3/22 11:51
 * @description wrapper for the results of a paged Elasticsearch query
 */
public class ElasticSearchResponseEntity<T> {
    private int from = 0;
    private int size = 10;
    private long total;
    private List<T> records;

    public ElasticSearchResponseEntity(int from, int size) {
        this.from = from;
        this.size = size;
    }

    public int getFrom() {
        return from;
    }

    public void setFrom(int from) {
        this.from = from;
    }

    public int getSize() {
        return size;
    }

    public void setSize(int size) {
        this.size = size;
    }

    public long getTotal() {
        return total;
    }

    public void setTotal(long total) {
        this.total = total;
    }

    public List<T> getRecords() {
        return records;
    }

    public void setRecords(List<T> records) {
        this.records = records;
    }
}
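A short usage sketch of the wrapper with hypothetical values, showing how a page of results would be handed to the web layer:

import com.hd.util.ElasticSearchResponseEntity;

import java.util.Arrays;

public class ResponseEntityDemo {
    public static void main(String[] args) {
        // from = 0, size = 10 mirror the defaults used by the search templates above
        ElasticSearchResponseEntity<String> page = new ElasticSearchResponseEntity<>(0, 10);
        page.setTotal(2);
        page.setRecords(Arrays.asList("doc-1", "doc-2"));
        System.out.println(page.getTotal() + " records, page size " + page.getSize());
    }
}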
ElasticSearchClentUtil.java
package com.hd.util;

import org.frameworkset.elasticsearch.ElasticSearchException;
import org.frameworkset.elasticsearch.ElasticSearchHelper;
import org.frameworkset.elasticsearch.client.ClientInterface;
import org.frameworkset.elasticsearch.entity.ESBaseData;
import org.frameworkset.elasticsearch.entity.ESDatas;

import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

/**
 * hzhh123
 *
 * Elasticsearch create/read/update/delete operations
 * @link https://gitee.com/bboss/bboss-elastic
 *
 */
public class ElasticSearchClentUtil {
    private String mapPath;

    public ElasticSearchClentUtil(String mapPath) {
        this.mapPath = mapPath;
    }

    /**
     * @param indexName    index name
     * @param indexMapping mapping (table structure) template name
     * @return
     * @description create an index
     */
    public String createIndex(String indexName, String indexMapping) throws Exception {
        // load the DSL configuration file; the client is a thread-safe singleton
        ClientInterface clientUtil = ElasticSearchHelper.getConfigRestClientUtil(mapPath);
        // check whether the index already exists
        boolean exist = clientUtil.existIndice(indexName);
        if (exist) {
            // drop the index before creating the mapping
            clientUtil.dropIndice(indexName);
        }
        // create the mapping
        return clientUtil.createIndiceMapping(indexName, indexMapping);
    }

    /**
     * @description drop an index
     * @param indexName
     * @return
     */
    public String dropIndex(String indexName) {
        // load the DSL configuration file; the client is a thread-safe singleton
        ClientInterface clientUtil = ElasticSearchHelper.getConfigRestClientUtil(mapPath);
        return clientUtil.dropIndice(indexName);
    }

    /**
     * @param indexName index name
     * @param indexType index type
     * @param id        document id
     * @return
     * @description delete a document
     */
    public String deleteDocment(String indexName, String indexType, String id) throws ElasticSearchException {
        // load the DSL configuration file; the client is a thread-safe singleton
        ClientInterface clientUtil = ElasticSearchHelper.getConfigRestClientUtil(mapPath);
        return clientUtil.deleteDocument(indexName, indexType, id);
    }

    /**
     * @param indexName index name
     * @param indexType index type
     * @param bean
     * @return
     * @description add a document
     */
    public <T> String addDocument(String indexName, String indexType, T bean) {
        // client for creating/updating/getting/deleting documents; thread-safe singleton
        ClientInterface clientUtil = ElasticSearchHelper.getConfigRestClientUtil(mapPath);
        return clientUtil.addDocument(indexName, indexType, bean);
    }

    /**
     *
     * @param path         index name plus the _search action
     * @param templateName DSL statement defined in esmapper/search.xml
     * @param queryFiled   query parameter name
     * @param keywords     query parameter value
     * @param from         first record of the page, default 0
     * @param size         page size, default 10
     * @return
     */
    public <T extends ESBaseData> ElasticSearchResponseEntity<T> searchDocumentByKeywords(String path, String templateName, String queryFiled, String keywords,
                                                                                          String from, String size, Class<T> beanClass) {
        // load the DSL configuration file; the client is a thread-safe singleton
        ClientInterface clientUtil = ElasticSearchHelper.getConfigRestClientUtil(mapPath);
        Map<String, Object> params = new HashMap<>();
        params.put(queryFiled, keywords);
        // paging parameters
        params.put("from", from);
        params.put("size", size);
        ElasticSearchResponseEntity<T> responseEntity = new ElasticSearchResponseEntity<>(Integer.parseInt(from), Integer.parseInt(size));
        // run the query: path is the index plus the _search action
        ESDatas<T> esDatas = // esDatas holds the matched records, at most 1000 as specified by "size" in the DSL
                clientUtil.searchList(path,  // index name plus the _search action
                        templateName,        // DSL statement defined in esmapper/search.xml
                        params,              // template variables
                        beanClass);          // type the returned documents are mapped to
        // result list, at most 1000 records
        List<T> documentList = esDatas.getDatas();
        System.out.println(documentList == null);
        // total number of hits
        long totalSize = esDatas.getTotalSize();
        responseEntity.setTotal(totalSize);
        for (int i = 0; documentList != null && i < documentList.size(); i++) { // iterate over the results
            T doc = documentList.get(i);
            // highlighted fragments of every field that matched the query
            Map<String, List<Object>> highlights = doc.getHighlight();
            Iterator<Map.Entry<String, List<Object>>> entries = highlights.entrySet().iterator();
            while (entries.hasNext()) {
                Map.Entry<String, List<Object>> entry = entries.next();
                String fieldName = entry.getKey();
                System.out.print(fieldName + ":");
                List