1:使用 SolrJ 的 API 将数据封装为 SolrInputDocument,提交给 Solr 服务创建索引
package test.client.impl;
import java.io.IOException;
import java.net.MalformedURLException;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer;
import org.apache.solr.client.solrj.request.CoreAdminRequest;
import org.apache.solr.common.SolrInputDocument;
/**
 * Bulk-indexes rows of {@code DataCenter_TempTable} into several Solr cores through SolrJ.
 *
 * <p>The work flow of {@link #process()} is:
 * <ol>
 *   <li>empty the target core;</li>
 *   <li>read the table in windows of {@code per} rows (by {@code num}) and send each window to
 *       the next core in round-robin order;</li>
 *   <li>merge every per-core index into the target core;</li>
 *   <li>empty the per-core indexes.</li>
 * </ol>
 *
 * <p>The JDBC connection is the shared static {@code conn} inherited from {@link BaseDao}.
 * Instances keep the current window position in {@code start} and are not designed for
 * concurrent use.
 */
public class DBTestDao extends BaseDao {
    private static final Logger LOGGER = Logger.getLogger(DBTestDao.class);

    /** Number of {@code num} values covered by one window / one upload. */
    private int per = 2000;
    /** Upper bound of {@code num}; indexing stops once {@code start} reaches it. */
    private int count = 74000;
    /** Exclusive lower bound of the next window; advanced by {@code per} after each read. */
    private int start = 0;
    /** Number of worker cores, addressed as {@code url + 0 .. url + (coreCount - 1)}. */
    private int coreCount = 5;
    private String targetCore = "http://127.0.0.1:8000/solr/targetCore";
    private String url = "http://127.0.0.1:8000/solr/core";
    private String indexParentPath = "D:/solr_OB1B/app/solr_multcore/core";

    static {
        PropertyConfigurator.configure("log4j.properties");
    }

    /**
     * Deletes every document from each worker core, then optimizes and commits it.
     *
     * @throws SolrServerException if Solr rejects one of the requests
     * @throws IOException if a core cannot be reached
     */
    private void removeOtherIndex() throws SolrServerException, IOException {
        for (int i = 0; i < coreCount; i++) {
            CommonsHttpSolrServer server = new CommonsHttpSolrServer(url + i);
            server.deleteByQuery("*:*");
            server.optimize();
            server.commit();
        }
    }

    /**
     * Deletes every document from the target core. Failures are logged and otherwise ignored so
     * that indexing still proceeds (best effort, as in the original implementation).
     */
    private void removeTargetIndex() {
        try {
            CommonsHttpSolrServer server = new CommonsHttpSolrServer(targetCore);
            server.deleteByQuery("*:*");
            server.optimize();
            server.commit();
            LOGGER.info("清空targetCore的数据");
        } catch (MalformedURLException e) {
            LOGGER.error("Invalid target core URL: " + targetCore, e);
        } catch (SolrServerException e) {
            LOGGER.error("Solr failed to clear the target core: " + targetCore, e);
        } catch (IOException e) {
            LOGGER.error("Cannot reach the target core: " + targetCore, e);
        }
    }

    /**
     * Reads the next window of rows, {@code start < num <= start + per}, converts each row into
     * a {@link SolrInputDocument} and advances {@code start} by {@code per}.
     *
     * <p>The upper bound is inclusive so that consecutive windows tile the {@code num} range
     * without gaps; with two strict comparisons the rows whose {@code num} is an exact multiple
     * of {@code per} were never selected by any window.
     *
     * @return the documents of the window; empty when no row falls into it
     * @throws IOException if there is no database connection or the query fails (the
     *         {@link SQLException} is attached as the cause). Propagating the failure, instead
     *         of swallowing it, prevents {@link #process()} from spinning forever on a window
     *         that can never be read.
     */
    private List<SolrInputDocument> getDocList() throws IOException {
        if (conn == null) {
            throw new IOException("No database connection available (BaseDao initialisation failed)");
        }
        int from = start;
        int to = start + per;
        String sql = "select num,metaid,active,productname,catalogid,url,price,count_productmeta,minimum_price,maximum_price,catalogid1,catalogname1,catalogid2,catalogname2,catalogid3,catalogname3,brandid,brandname, imagepath ,[rank],[description],displayname,updatetime,createdatetime,isfirstproduct,commoncount,type from DataCenter_TempTable as t where t.num>"
                + from + " AND t.num<=" + to;
        List<SolrInputDocument> list = new ArrayList<SolrInputDocument>();
        Statement stat = null;
        ResultSet rs = null;
        try {
            stat = conn.createStatement();
            LOGGER.info(sql);
            rs = stat.executeQuery(sql);
            while (rs.next()) {
                list.add(toDocument(rs));
            }
        } catch (SQLException e) {
            IOException wrapped = new IOException("Query failed for num window (" + from + ", " + to + "]");
            wrapped.initCause(e);
            throw wrapped;
        } finally {
            closeQuietly(rs, stat);
        }
        // Only move on once the window has been read completely.
        start = to;
        return list;
    }

    /**
     * Maps the current row of {@code rs} to a Solr document.
     *
     * @param rs result set positioned on the row to convert
     * @return the populated document
     * @throws SQLException if a column cannot be read
     */
    private SolrInputDocument toDocument(ResultSet rs) throws SQLException {
        SolrInputDocument doc = new SolrInputDocument();
        doc.addField("metaid", rs.getObject("metaid"));
        doc.addField("id", rs.getObject("num"));
        doc.addField("url", rs.getObject("url"));
        doc.addField("count_productmeta", rs.getObject("count_productmeta"));
        doc.addField("minimum_price", rs.getObject("minimum_price"));
        doc.addField("maximum_price", rs.getObject("maximum_price"));
        doc.addField("rank", rs.getObject("rank"));
        doc.addField("imagepath", rs.getObject("imagepath"));
        doc.addField("displayname", rs.getObject("displayname"));
        doc.addField("description", rs.getObject("description"));
        // NOTE(review): brandfacet is filled from "num", which looks like a copy-paste slip
        // (catalogfacetN is filled from catalognameN) - confirm whether "brandname" was meant.
        doc.addField("brandfacet", rs.getObject("num"));
        doc.addField("brandname", rs.getObject("brandname"));
        doc.addField("productname", rs.getObject("productname"));
        // The catalog id fields take their own columns; previously all three were filled with
        // "metaid" although catalogid1..3 are selected and used for searchText below.
        doc.addField("catalogid3", rs.getObject("catalogid3"));
        doc.addField("catalogfacet3", rs.getObject("catalogname3"));
        doc.addField("catalogid2", rs.getObject("catalogid2"));
        doc.addField("catalogfacet2", rs.getObject("catalogname2"));
        doc.addField("catalogid1", rs.getObject("catalogid1"));
        doc.addField("catalogfacet1", rs.getObject("catalogname1"));
        doc.addField("updatetime", rs.getObject("updatetime"));
        doc.addField("createdatetime", rs.getObject("createdatetime"));
        doc.addField("isfirstproduct", rs.getObject("isfirstproduct"));
        doc.addField("catalogid", rs.getObject("catalogid"));
        doc.addField("commoncount", rs.getObject("commoncount"));
        // Free-text field: brand, leaf catalog name, product name and the three catalog ids.
        String searchText = rs.getString("brandname") + rs.getString("catalogname3")
                + rs.getString("productname") + rs.getInt("catalogid1") + rs.getInt("catalogid2")
                + rs.getInt("catalogid3");
        doc.addField("searchText", searchText);
        return doc;
    }

    /** Closes the given JDBC resources, logging (not throwing) any failure. */
    private void closeQuietly(ResultSet rs, Statement stat) {
        if (rs != null) {
            try {
                rs.close();
            } catch (SQLException e) {
                LOGGER.warn("Failed to close ResultSet", e);
            }
        }
        if (stat != null) {
            try {
                stat.close();
            } catch (SQLException e) {
                LOGGER.warn("Failed to close Statement", e);
            }
        }
    }

    /**
     * Reads the next window of rows and uploads it to worker core number {@code core}.
     *
     * @param core index of the worker core (appended to the base core URL)
     * @throws SolrServerException if Solr rejects the update or the commit
     * @throws IOException if the core cannot be reached or the rows cannot be read
     */
    public void startCoreIndex(int core) throws SolrServerException, IOException {
        String solrUrl = url + core;
        int from = start;
        List<SolrInputDocument> list = getDocList();
        if (list.isEmpty()) {
            // Gap in the num sequence: nothing to upload for this window.
            LOGGER.info("No rows for core" + core + " in window (" + from + ", " + start + "], skipped");
            return;
        }
        CommonsHttpSolrServer server = new CommonsHttpSolrServer(solrUrl);
        LOGGER.info(".........开始为core" + core + "提交数据!!!数据开始【" + from + "】结束【" + start + "】");
        server.add(list);
        server.commit();
        LOGGER.info(".........core" + core + "数据提交结束");
    }

    /**
     * Merges the on-disk index of every worker core into the target core, then commits and
     * optimizes the target core.
     *
     * @throws SolrServerException if Solr rejects a merge, the commit or the optimize
     * @throws IOException if the Solr server cannot be reached
     */
    public void mergeIndex() throws SolrServerException, IOException {
        // One admin connection is enough for all merge requests.
        CommonsHttpSolrServer adminServer = new CommonsHttpSolrServer("http://127.0.0.1:8000/solr");
        for (int i = 0; i < coreCount; i++) {
            LOGGER.info("开始合并:core" + i);
            String[] indexDirs = { indexParentPath + i + "/data/index" };
            CoreAdminRequest.mergeIndexes("targetCore", indexDirs, adminServer);
            LOGGER.info("core" + i + "合并完成.....");
        }
        CommonsHttpSolrServer targetServer = new CommonsHttpSolrServer(targetCore);
        targetServer.commit();
        targetServer.optimize();
        LOGGER.info("所有 core合并完成.....");
    }

    /**
     * Runs the complete job: clear the target core, index all windows round-robin over the
     * worker cores, merge into the target core, print the elapsed milliseconds and finally
     * clear the worker cores.
     *
     * @throws SolrServerException if any Solr request is rejected
     * @throws IOException if Solr cannot be reached or the database cannot be read
     */
    public void process() throws SolrServerException, IOException {
        removeTargetIndex();
        long startDate = new Date().getTime();
        while (start < count) {
            // Hand one window to each core in turn until the whole range is consumed.
            for (int i = 0; i < coreCount && start < count; i++) {
                startCoreIndex(i);
            }
        }
        LOGGER.info("数据索引完成");
        mergeIndex();
        long endDate = new Date().getTime();
        System.out.println("~~~~~~~~~~~~~~~~~~~" + (endDate - startDate));
        removeOtherIndex();
    }

    /**
     * Command-line entry point; runs {@link #process()} once and logs any failure.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        DBTestDao dao = new DBTestDao();
        try {
            dao.process();
        } catch (SolrServerException e) {
            LOGGER.error("Indexing aborted: Solr reported an error", e);
        } catch (IOException e) {
            LOGGER.error("Indexing aborted: I/O failure", e);
        }
    }
}
package test.client.impl;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
/**
 * Base class for DAOs: configures log4j and opens one JDBC connection to SQL Server when the
 * class is loaded, exposing it to subclasses through the static field {@link #conn}.
 *
 * <p>If the driver cannot be loaded or the connection cannot be opened, the failure is logged
 * and {@code conn} stays {@code null}; subclasses must be prepared for that.
 */
public class BaseDao {
    private static final Logger LOGGER = Logger.getLogger(BaseDao.class);

    // NOTE(review): connection settings and credentials are hard-coded in source; they should
    // be moved to external configuration and the password rotated before this is shared.
    private static String url = "jdbc:sqlserver://10.16.230.40:1433;DatabaseName=CrawlerDataCenter_Zol_Test";
    private static String password = "xalab@123";
    private static String username = "sa";

    /** Shared connection, or {@code null} when initialisation failed. */
    protected static Connection conn;

    static {
        PropertyConfigurator.configure("log4j.properties");
        try {
            Class.forName("com.microsoft.sqlserver.jdbc.SQLServerDriver");
            conn = DriverManager.getConnection(url, username, password);
        } catch (ClassNotFoundException e) {
            LOGGER.error("SQL Server JDBC driver is not on the classpath", e);
        } catch (SQLException e) {
            LOGGER.error("Cannot open database connection: " + url, e);
        }
    }

    /** Creates a DAO; the connection itself is opened once by the static initialiser. */
    public BaseDao() {
    }

    /**
     * Smoke test: instantiating the class triggers the static initialiser and thereby the
     * connection attempt.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        new BaseDao();
    }
}
<?xml version="1.0" encoding="UTF-8" ?>
<!-- Solr schema for the product index: declares the field types, the indexed fields,
     the unique key (id) and the default search field (searchText). -->
<schema name="example" version="1.2">
<types>
<!-- Numeric and date types; all use precisionStep="0" and omit norms. -->
<fieldType name="int" class="solr.TrieIntField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
<fieldType name="float" class="solr.TrieFloatField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
<fieldType name="long" class="solr.TrieLongField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
<fieldType name="double" class="solr.TrieDoubleField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
<fieldType name="date" class="solr.TrieDateField" omitNorms="true" precisionStep="0" positionIncrementGap="0"/>
<!-- Untokenized string type, used for ids, names, urls and facet values. -->
<fieldType name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/>
<!-- Tokenized text type based on the IK analyzer classes (org.wltea.analyzer).
     NOTE(review): this type declares both an untyped class-based analyzer and a
     type="index" analyzer chain. Which of the two is applied at index time depends on how
     the deployed Solr version resolves that combination; confirm, and consider declaring
     the class-based one as type="query" explicitly. -->
<fieldtype name="textZN" class="solr.TextField">
<analyzer class="org.wltea.analyzer.lucene.IKAnalyzer"/>
<analyzer type="index">
<tokenizer class="org.wltea.analyzer.solr.IKTokenizerFactory" isMaxWordLength="false"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="0"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.ReversedWildcardFilterFactory" withOriginal="true"
maxPosAsterisk="3" maxPosQuestion="2" maxFractionAsterisk="0.33"/>
</analyzer>
</fieldtype>
</types>
<fields>
<!-- id is the unique key of the index (see uniqueKey below) and therefore required. -->
<field name="id" type="string" indexed="true" stored="true" required="true" />
<field name="itemPrice" type="float" indexed="true" stored="true" />
<field name="metaid" type="int" indexed="true" stored="true" />
<field name="imagepath" type="string" indexed="true" stored="true" />
<field name="brandid" type="int" indexed="true" stored="false" />
<field name="brandfacet" type="string" indexed="true" stored="true" />
<field name="brandname" type="string" indexed="true" stored="true" />
<field name="url" type="string" indexed="true" stored="true" />
<field name="count_productmeta" type="string" indexed="true" stored="true" />
<field name="minimum_price" type="float" indexed="true" stored="true" />
<field name="maximum_price" type="float" indexed="true" stored="true" />
<field name="rank" type="int" indexed="true" stored="true" />
<field name="displayname" type="string" indexed="true" stored="true" />
<field name="description" type="textZN" indexed="true" stored="true" />
<!-- pid and propFacet are the only multi-valued fields. -->
<field name="pid" type="int" indexed="true" stored="false" multiValued="true" />
<field name="propFacet" type="string" indexed="true" stored="true" multiValued="true" />
<field name="productname" type="string" indexed="true" stored="true" />
<!-- Three catalog levels: the id is indexed only, the facet (name) is indexed and stored. -->
<field name="catalogid1" type="int" indexed="true" stored="false" />
<field name="catalogfacet1" type="string" indexed="true" stored="true" />
<field name="catalogid2" type="int" indexed="true" stored="false" />
<field name="catalogfacet2" type="string" indexed="true" stored="true" />
<field name="catalogid3" type="int" indexed="true" stored="false" />
<field name="catalogfacet3" type="string" indexed="true" stored="true" />
<!-- The two timestamps are declared as plain strings, not as the date type. -->
<field name="updatetime" type="string" indexed="true" stored="true" />
<field name="createdatetime" type="string" indexed="true" stored="true" />
<field name="isfirstproduct" type="int" indexed="true" stored="true" />
<field name="catalogid" type="int" indexed="true" stored="true" />
<field name="commoncount" type="int" indexed="true" stored="true" />
<field name="catalogtype" type="string" indexed="true" stored="true" />
<field name="autocompleteword" type="textZN" indexed="true" stored="true" />
<!-- Default search field; the copyField rules that could populate it are commented out
     at the end of this file, so the indexing client has to supply its value. -->
<field name="searchText" type="textZN" indexed="true" stored="true" />
<!-- Dynamic field definitions. If a field name is not found, dynamicFields
will be used if the name matches any of the patterns.
RESTRICTION: the glob-like pattern in the name attribute must have
a "*" only at the start or the end.
EXAMPLE: name="*_i" will match any field ending in _i (like myid_i, z_i)
Longer patterns will be matched first. if equal size patterns
both match, the first appearing in the schema will be used. -->
<dynamicField name="*_i" type="int" indexed="true" stored="true"/>
<dynamicField name="*_s" type="string" indexed="true" stored="true"/>
<dynamicField name="*_l" type="long" indexed="true" stored="true"/>
<dynamicField name="*_f" type="float" indexed="true" stored="true"/>
<dynamicField name="*_d" type="double" indexed="true" stored="true"/>
<dynamicField name="*_dt" type="date" indexed="true" stored="true"/>
</fields>
<uniqueKey>id</uniqueKey>
<defaultSearchField>searchText</defaultSearchField>
<!-- Query terms are combined with AND unless the query specifies an operator. -->
<solrQueryParser defaultOperator="AND"/>
<!--
<copyField source="itemName" dest="searchText" />
<copyField source="itemBrand" dest="searchText" />
<copyField source="itemCatalog" dest="searchText" />
<copyField source="itemDesc" dest="searchText" />
-->
</schema>
2:通过 data-config.xml 配置(DataImportHandler)进行数据导入
import java.io.File;
import java.io.IOException;
import java.util.Date;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import org.apache.solr.client.solrj.SolrRequest;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer;
import org.apache.solr.client.solrj.request.CoreAdminRequest;
import org.apache.solr.client.solrj.request.QueryRequest;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.util.NamedList;
import com.newegg.labxa.ob1b.search.tools.OpertProperty;
import com.newegg.labxa.ob1b.search.tools.SolrServiceFactory;
/**
 * Builds the Solr index through the DataImportHandler ({@code /dataimport}) of several worker
 * cores and merges the per-core indexes into one target core.
 *
 * <p>All settings (Solr base URL, core names, window size, index directory, target core name)
 * are read from {@link OpertProperty}. The class keeps the current import position in static
 * state and is not designed for concurrent use.
 */
public class SolrCreateIndex {
    private static final Logger LOGGER = Logger.getLogger(SolrCreateIndex.class);
    private static final OpertProperty OPT = OpertProperty.getInstance();
    private static final String TARGET_CORE_NAME = OPT.getValue(OpertProperty.TARGET_CORE);
    /** Maximum number of status polls (2 s apart) while waiting for a core to become idle. */
    private static final int MAX_IDLE_POLLS = 500;
    /** Base URL of the Solr web application; always ends with "/". */
    private static String createSolrUrl = OPT.getValue(OpertProperty.CREATE_SOLR_URL);
    /** First id of the next import window. */
    private static int start = 1;
    /** Number of ids per import window. */
    private static int size = 1000;

    static {
        size = OPT.getValueForInteger(OpertProperty.CORE_DATA_NUM);
        if (!createSolrUrl.endsWith("/")) {
            createSolrUrl = createSolrUrl + "/";
        }
        PropertyConfigurator.configure("log4j.properties");
    }

    /**
     * Entry point of the multi-core index build: clears the target core, imports id windows
     * round-robin over the configured cores, merges the cores into the target core and finally
     * clears the worker cores.
     *
     * <p>Nothing is touched when no worker cores are configured. The import position is reset
     * on every call, so a repeated call rebuilds the index instead of wiping the target core
     * and importing nothing.
     *
     * @param totalNum upper bound of the ids to import; windows are started while the window
     *        start is below this value
     */
    public static void createIndex(int totalNum) {
        String[] cores = OPT.getValueForStringArray(OpertProperty.SOLR_CORES);
        if (cores == null || cores.length == 0) {
            // Checked before the target core is cleared: without worker cores nothing could
            // be rebuilt, so the existing index must be left alone.
            LOGGER.warn("No worker cores configured; index build skipped");
            return;
        }
        start = 1;
        // Remove the old index from the target core before the full rebuild.
        deleteCoreIndex(new String[] { TARGET_CORE_NAME });
        long startDate = new Date().getTime();
        while (start < totalNum) {
            // 1. Let every core import one window in turn.
            for (int i = 0; i < cores.length && start < totalNum; i++) {
                startOneCoreCreate(createSolrUrl + cores[i] + "/");
            }
        }
        // 2. Merge the index of every worker core into the target core.
        try {
            mergingIndex(cores, TARGET_CORE_NAME);
            long endDate = new Date().getTime();
            System.out.println("~~~~~~~~~~~~~~~~~~~" + (endDate - startDate));
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            LOGGER.error("Interrupted while merging indexes into " + TARGET_CORE_NAME, e);
        } catch (Exception e) {
            LOGGER.error("Merging indexes into " + TARGET_CORE_NAME + " failed", e);
        }
        // 3. Remove the index of core0, core1, core2 ...
        deleteCoreIndex(cores);
    }

    /**
     * Starts a DataImportHandler full-import of the next id window on one core. The window
     * position only advances when the import request and the commit succeed; if the core is
     * not idle or the request fails, the same window is offered to the next core.
     *
     * @param url base URL of the core, ending with "/"
     */
    private static void startOneCoreCreate(String url) {
        CommonsHttpSolrServer server = SolrServiceFactory.getSolrServer(url);
        // Skip this core while its previous import is still running.
        if (!createIndexIsOk(server)) {
            return;
        }
        int from = start;
        int to = start + size;
        ModifiableSolrParams params = new ModifiableSolrParams();
        params.add("qt", "/dataimport");
        params.add("clean", "false");
        params.add("command", "full-import");
        params.add("commit", "true");
        params.add("startid", String.valueOf(from));
        params.add("endid", String.valueOf(to));
        try {
            server.query(params);
            server.commit();
        } catch (SolrServerException e) {
            LOGGER.error("Import failed on " + url + " for start=" + from + " end=" + to, e);
            return;
        } catch (IOException e) {
            LOGGER.error("Cannot reach " + url + " for start=" + from + " end=" + to, e);
            return;
        }
        start = to;
        LOGGER.info("创建索引:" + url + "start=" + from + " end=" + to);
    }

    /**
     * Merges the index directory of every worker core into the target core, then commits and
     * optimizes the target core. Before each merge the core is polled until its import handler
     * reports idle (at most {@link #MAX_IDLE_POLLS} polls).
     *
     * @param cores names of the worker cores
     * @param targetCoreName name of the core that receives the merged index
     * @throws RuntimeException if the index path of a core is missing or is not a directory
     * @throws Exception if a Solr request fails
     */
    private static void mergingIndex(String[] cores, String targetCoreName) throws Exception {
        String indexParentPath = OPT.getValue(OpertProperty.INDEX_PARENT_PATH);
        if (!indexParentPath.endsWith("/")) {
            indexParentPath = indexParentPath + "/";
        }
        CommonsHttpSolrServer server = SolrServiceFactory.getSolrServer(createSolrUrl);
        for (String coreName : cores) {
            LOGGER.info("开始合并:" + coreName);
            String indexPath = indexParentPath + coreName + "/data/index";
            File dir = new File(indexPath);
            // Reject both a missing path and a path that is a regular file.
            if (!dir.exists() || !dir.isDirectory()) {
                throw new RuntimeException(indexPath);
            }
            String[] indexDirs = { indexPath };

            // Wait until the import on this core has finished; bounded to avoid an endless loop.
            String oneCoreUrl = createSolrUrl + coreName + "/";
            CommonsHttpSolrServer oneCoreServer = SolrServiceFactory.getSolrServer(oneCoreUrl);
            boolean idle = false;
            for (int poll = 0; poll < MAX_IDLE_POLLS && !idle; poll++) {
                idle = createIndexIsOk(oneCoreServer);
            }
            if (!idle) {
                LOGGER.warn(coreName + " did not report idle after " + MAX_IDLE_POLLS
                        + " polls; merging anyway");
            }

            LOGGER.info(targetCoreName);
            for (String dirName : indexDirs) {
                LOGGER.info(dirName);
            }
            LOGGER.info(createSolrUrl);
            CoreAdminRequest.mergeIndexes(targetCoreName, indexDirs, server);
            LOGGER.info(coreName + "合并完成.....");
        }
        String targetCore = createSolrUrl + targetCoreName;
        CommonsHttpSolrServer targetServer = SolrServiceFactory.getSolrServer(targetCore);
        targetServer.commit();
        targetServer.optimize();
        LOGGER.info("索引全部合并完成....");
    }

    /**
     * Tells whether the import on a core has finished by requesting {@code /dataimport} and
     * checking that the returned {@code status} is {@code idle}. Sleeps two seconds first so
     * that callers polling in a loop do not flood the server.
     *
     * @param oneCoreServer connection to the core to check
     * @return {@code true} if the status is "idle"; {@code false} if it is anything else or the
     *         request failed
     */
    private static boolean createIndexIsOk(CommonsHttpSolrServer oneCoreServer) {
        try {
            Thread.sleep(2 * 1000);
        } catch (InterruptedException e) {
            // Keep the interrupt visible to callers instead of swallowing it.
            Thread.currentThread().interrupt();
            LOGGER.warn("Interrupted while waiting before the status check", e);
        }
        System.out.println("*************************************************");
        ModifiableSolrParams params = new ModifiableSolrParams();
        params.add("qt", "/dataimport");
        SolrRequest request = new QueryRequest(params);
        try {
            NamedList<Object> names = oneCoreServer.request(request);
            String diHStart = (String) names.get("status");
            LOGGER.info("core的状况:" + oneCoreServer.getBaseURL() + " :" + diHStart);
            return "idle".equalsIgnoreCase(diHStart);
        } catch (SolrServerException e) {
            LOGGER.error("Status request rejected by " + oneCoreServer.getBaseURL(), e);
        } catch (IOException e) {
            LOGGER.error("Cannot reach " + oneCoreServer.getBaseURL(), e);
        }
        return false;
    }

    /**
     * Deletes all documents of the given cores (delete-all, optimize, commit). A failure on
     * one core is logged and the remaining cores are still processed.
     *
     * @param cores names of the cores to clear
     */
    private static void deleteCoreIndex(String[] cores) {
        for (String core : cores) {
            String coreUrl = createSolrUrl + core;
            CommonsHttpSolrServer coreServer = SolrServiceFactory.getSolrServer(coreUrl);
            try {
                coreServer.deleteByQuery("*:*");
                coreServer.optimize();
                coreServer.commit();
                LOGGER.info("删除" + core + "上的索引完成");
            } catch (SolrServerException e) {
                LOGGER.error("Solr failed to clear " + coreUrl, e);
            } catch (IOException e) {
                LOGGER.error("Cannot reach " + coreUrl, e);
            }
        }
    }

    /**
     * Runs a delta-import on the target core without optimizing it, so that replication only
     * has to ship the updated part of the index to the slaves.
     */
    public static void deltaIndex() {
        String targetCoreUrl = createSolrUrl + TARGET_CORE_NAME;
        CommonsHttpSolrServer server = SolrServiceFactory.getSolrServer(targetCoreUrl);
        ModifiableSolrParams params = new ModifiableSolrParams();
        params.add("qt", "/dataimport");
        params.add("clean", "false");
        params.add("command", "delta-import");
        params.add("commit", "true");
        // Deliberately no optimize/merge here: only the newly added segments are distributed.
        try {
            server.query(params);
            server.commit();
        } catch (SolrServerException e) {
            LOGGER.error("Delta import failed on " + targetCoreUrl, e);
            return;
        } catch (IOException e) {
            LOGGER.error("Cannot reach " + targetCoreUrl, e);
            return;
        }
        LOGGER.info("更新索引:" + targetCoreUrl);
    }

    /**
     * Command-line entry point; rebuilds the index for ids up to 74000.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        createIndex(74000);
    }
}
Sql 总记录 74000 条
方式一(对应上文第 2 节):
通过 data-config.xml 配置方式创建索引
索引 field 31 个
core数量 5个
生成 index 文件 141MB
共耗时 93061ms
平均 0.001515MB/ms
方式二(对应上文第 1 节):
通过 SolrJ API 以 SolrInputDocument 方式提交
索引 field 25 个
core数量 5个
生成 index 文件 113MB
共耗时 66061ms
平均 0.001710MB/ms
本文详细介绍了如何利用Solr的API和data-config.xml配置文件,通过多核心创建索引和数据导入流程,实现高效的数据存储与检索。包括使用SolrInputDocument提交数据至Solr服务,构建索引,以及通过多核心合并索引至目标核心的过程,最终实现数据的高效管理和快速查询。

723

被折叠的 条评论
为什么被折叠?



