package com.crazyice.lee.accumulation.search.service;
import com.alibaba.fastjson.JSONObject;
import com.crazyice.lee.accumulation.search.data.Article;
import com.crazyice.lee.accumulation.search.utils.Md5CaculateUtil;
import io.searchbox.client.JestClient;
import io.searchbox.core.Index;
import io.searchbox.core.Search;
import io.searchbox.core.SearchResult;
import lombok.extern.slf4j.Slf4j;
import net.sf.jmimemagic.*;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.builder.SearchSourceBuilder;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
@Component
@Slf4j
public
class
DirectoryRecurse {
@Autowired
private
JestClient jestClient;
/**
 * Extracts the text of {@code file} according to {@code fileType}.
 *
 * <p>Plain-text types ({@code text/plain}, {@code java}, {@code c}, {@code cpp},
 * {@code txt}) are read as raw bytes and decoded with the "utf8" charset.
 * {@code doc} files go through {@link WordExtractor} and {@code docx} files
 * through {@link XWPFWordExtractor}. Any other type yields an empty string.
 *
 * <p>I/O and extraction failures are logged and do not propagate; whatever text
 * was collected before the failure (possibly none) is returned.
 *
 * @param file     file to read
 * @param fileType MIME type or file extension that selects the extraction strategy
 * @return the extracted text, or an empty string if nothing could be extracted
 */
private String readToString(File file, String fileType) {
    StringBuilder result = new StringBuilder();
    if (fileType == null) {
        // switch on a null String would throw NullPointerException.
        return result.toString();
    }
    switch (fileType) {
        case "text/plain":
        case "java":
        case "c":
        case "cpp":
        case "txt": {
            long fileLength = file.length();
            if (fileLength > Integer.MAX_VALUE) {
                // A byte[] cannot hold more than Integer.MAX_VALUE elements.
                log.error("file too large to read into memory: {} ({} bytes)", file.getPath(), fileLength);
                break;
            }
            try (FileInputStream in = new FileInputStream(file)) {
                byte[] fileContent = new byte[(int) fileLength];
                int offset = 0;
                // A single read() may return fewer bytes than requested, so keep
                // reading until the buffer is full or the stream ends.
                while (offset < fileContent.length) {
                    int read = in.read(fileContent, offset, fileContent.length - offset);
                    if (read < 0) {
                        break;
                    }
                    offset += read;
                }
                // Decode only the bytes actually read, so no trailing NULs appear
                // if the file shrank after its length was sampled.
                result.append(new String(fileContent, 0, offset, "utf8"));
            } catch (IOException e) {
                // Also covers FileNotFoundException and UnsupportedEncodingException.
                log.error("{}", e.getLocalizedMessage(), e);
            }
            break;
        }
        case "doc":
            try (FileInputStream in = new FileInputStream(file)) {
                WordExtractor extractor = new WordExtractor(in);
                result.append(extractor.getText());
            } catch (Exception e) {
                log.error("{}", e.getLocalizedMessage(), e);
            }
            break;
        case "docx":
            try (FileInputStream in = new FileInputStream(file); XWPFDocument doc = new XWPFDocument(in)) {
                XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
                result.append(extractor.getText());
            } catch (Exception e) {
                log.error("{}", e.getLocalizedMessage(), e);
            }
            break;
        default:
            // Unsupported type: nothing to extract.
            break;
    }
    return result.toString();
}
/**
 * Looks up whether a document with this file's fingerprint is already present
 * in index "diskfile", type "files".
 *
 * <p>The returned object always contains both keys:
 * <ul>
 *   <li>"fileFingerprint" - the value returned by {@code Md5CaculateUtil.getMD5(file)}</li>
 *   <li>"isIndex" - {@code true} only when the search reported at least one hit;
 *       {@code false} when there were no hits, when the response carried no hit
 *       total, or when the search request failed with an {@link IOException}</li>
 * </ul>
 *
 * @param file file whose fingerprint is searched for
 * @return a JSON object with the fingerprint and the lookup outcome
 */
private JSONObject isIndex(File file) {
    JSONObject result = new JSONObject();
    String fileFingerprint = Md5CaculateUtil.getMD5(file);
    result.put("fileFingerprint", fileFingerprint);
    // Set a default up front so callers never unbox a missing (null) value
    // when the search below fails.
    result.put("isIndex", false);

    SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder();
    searchSourceBuilder.query(QueryBuilders.termQuery("fileFingerprint", fileFingerprint));
    Search search = new Search.Builder(searchSourceBuilder.toString())
            .addIndex("diskfile")
            .addType("files")
            .build();
    try {
        SearchResult searchResult = jestClient.execute(search);
        // getTotal() returns a boxed number; guard against null before comparing
        // (presumably the case when the search itself was unsuccessful, e.g. the
        // index does not exist yet - confirm against the Jest version in use).
        Number total = searchResult.getTotal();
        result.put("isIndex", total != null && total.longValue() > 0);
    } catch (IOException e) {
        log.error("{}", e.getLocalizedMessage(), e);
    }
    return result;
}
/**
 * Indexes a single file into index "diskfile", type "files", unless a document
 * with the same fingerprint is already reported by {@code isIndex}.
 *
 * <p>The file type is determined by {@code method}:
 * <ul>
 *   <li>"magic" - MIME type detected by jmimemagic</li>
 *   <li>"ext" - the text after the last '.' in the file name</li>
 * </ul>
 * Only {@code text/plain}, {@code java}, {@code c}, {@code cpp}, {@code txt},
 * {@code doc} and {@code docx} are indexed. Files whose type cannot be
 * determined or is not in that list are skipped, as are files whose name starts
 * with "~$".
 *
 * @param file   file to index
 * @param method type detection strategy, "magic" or "ext"
 */
private void createIndex(File file, String method) {
    // Names starting with "~$" are skipped (presumably Office temporary/lock files).
    if (file.getName().startsWith("~$")) {
        return;
    }

    String fileType = null;
    switch (method) {
        case "magic":
            Magic parser = new Magic();
            try {
                MagicMatch match = parser.getMagicMatch(file, false);
                fileType = match.getMimeType();
            } catch (MagicMatchNotFoundException e) {
                // No match simply means the type is unknown; the file is skipped below.
                log.debug("no magic match for {}", file.getPath());
            } catch (MagicParseException e) {
                log.warn("failed to detect type of {}: {}", file.getPath(), e.getLocalizedMessage(), e);
            } catch (MagicException e) {
                log.warn("failed to detect type of {}: {}", file.getPath(), e.getLocalizedMessage(), e);
            }
            break;
        case "ext":
            String filename = file.getName();
            int dot = filename.lastIndexOf('.');
            // A name without a dot has no extension; do not mistake the whole
            // name (e.g. a file called "c") for one.
            if (dot >= 0) {
                fileType = filename.substring(dot + 1);
            }
            break;
        default:
            log.warn("unknown file type detection method: {}", method);
            break;
    }
    if (fileType == null) {
        // Detection failed; switching on a null String would throw.
        return;
    }

    switch (fileType) {
        case "text/plain":
        case "java":
        case "c":
        case "cpp":
        case "txt":
        case "doc":
        case "docx":
            break;
        default:
            // Unsupported type.
            return;
    }

    JSONObject isIndexResult = isIndex(file);
    log.info("文件名:{},文件类型:{},MD5:{},建立索引:{}", file.getPath(), fileType,
            isIndexResult.getString("fileFingerprint"), isIndexResult.getBoolean("isIndex"));
    // Null-safe: a missing "isIndex" value is treated as "not indexed".
    if (Boolean.TRUE.equals(isIndexResult.getBoolean("isIndex"))) {
        return;
    }

    Article article = new Article();
    article.setTitle(file.getName());
    article.setAuthor(file.getParent());
    article.setPath(file.getPath());
    article.setContent(readToString(file, fileType));
    article.setFileFingerprint(isIndexResult.getString("fileFingerprint"));

    Index index = new Index.Builder(article).index("diskfile").type("files").build();
    try {
        String id = jestClient.execute(index).getId();
        if (id != null && !id.isEmpty()) {
            log.info("构建索引成功!");
        } else {
            log.warn("index request returned no document id for {}", file.getPath());
        }
    } catch (IOException e) {
        log.error("{}", e.getLocalizedMessage(), e);
    }
}
public
void find(String pathName) throws IOException {
File dirFile =
new
File(pathName);
if
(!dirFile.exists()) {
log.info(
"do not exit"
);
return
;
}
if
(!dirFile.isDirectory()) {
if
(dirFile.isFile()) {
createIndex(dirFile,
"ext"
);
}
return
;
}
String[] fileList = dirFile.list();
for
(int i = 0; i <p><strong>扫描任务</strong><br></p><p>这里采用定时任务的方式来扫描指定目录以实现动态增量创建索引。</p><pre
class
=
"brush:php;toolbar:false"
>package com.crazyice.lee.accumulation.search.service;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.context.annotation.Configuration;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import java.io.IOException;
@Configuration
@Component
@Slf4j
public
class
CreateIndexTask {
@Autowired
private
DirectoryRecurse directoryRecurse;
@Value(
"${index-root}"
)
private
String indexRoot;
@Scheduled(cron =
"* 0/5 * * * ?"
)
private
void addIndex(){
try
{
directoryRecurse.find(indexRoot);
directoryRecurse.writeIndexStatus();
}
catch
(IOException e) {
log.error(
"{}"
,e.getLocalizedMessage());
}
}
}