python搜索引擎pylucene_搜索引擎–Python文本文件分割、PyLucene建立索引和索引搜索...

最新推荐文章于 2023-11-03 18:01:17 发布

原创最新推荐文章于 2023-11-03 18:01:17 发布 · 555 阅读

0 ·

本内容遵循CC 4.0 BY-SA版权协议

标签

#python搜索引擎pylucene

本文介绍了如何使用Python和PyLucene实现简单的搜索引擎。通过splitFiles.py将大文本文件切分为小文件，IndexFiles.py创建索引，SearchFile.py进行搜索。PyLucene是Java Lucene的Python接口，允许在Python环境中利用Lucene的强大搜索功能。

Python3.8

Python 是一种高级、解释型、通用的编程语言，以其简洁易读的语法而闻名，适用于广泛的应用，包括Web开发、数据分析、人工智能和自动化脚本

主机平台：Ubuntu 13.04

Python版本：2.7.4

PyLucene版本：4.4.0

最近想使用Python来做一个小的搜索引擎，一来是强化学习一下Python语言，二来是学习一下搜索引擎的实现原理。

在网上搜索了好久，网上的资料和书籍都是使用JAVA平台下的Lucene，而PyLucene是让Python能够调用Java Lucene的扩展（并不是用Python重新实现的Lucene）。下面是官网的说明：

PyLucene is a Python extension for accessing Java Lucene™. Its goal is to allow you to use Lucene’s text indexing and searching capabilities from Python. It is API compatible with the latest version of Java Lucene, version 4.4.0.

PyLucene是一个用于在Python中访问Java Lucene的扩展。它的目标是让你能够在Python中使用Lucene的文本索引和搜索功能。它的API和最新版本的Java Lucene（4.4.0）是兼容的。

虽然官网说API是兼容的，但是对Python还不是很熟悉的情况下，感觉还是有点怵，慢慢摸索吧。

splitFiles.py

它实现了将大文本文件切割成50行的小文本文件。

#!/usr/bin/env python

import os

import sys

import os.path

def split(file, lines_per_file=50):
    """Split a text file into consecutive chunks of at most ``lines_per_file`` lines.

    The chunks are written next to the source file as ``output_0.txt``,
    ``output_1.txt``, ... in the order the lines appear.  ``output_0.txt`` is
    always created, even when the source file is empty.

    NOTE: the chunk names do not include the source file name, so splitting
    two files that live in the same directory overwrites the earlier chunks.

    Args:
        file: Path of the text file to split.
        lines_per_file: Maximum number of lines per chunk (default 50).

    Raises:
        ValueError: If ``lines_per_file`` is smaller than 1.
        SystemExit: With status 1 if ``file`` is not an existing regular file.
    """
    if lines_per_file < 1:
        raise ValueError("lines_per_file must be at least 1")

    if not os.path.isfile(file):
        sys.stdout.write("%s is not a file\n" % file)
        sys.exit(1)

    # os.path.join keeps the chunks relative to the current directory when
    # `file` has no directory component, instead of resolving to "/output_N.txt".
    dirname = os.path.dirname(file)

    def _open_chunk(index):
        return open(os.path.join(dirname, "output_%d.txt" % index), "w")

    with open(file, "r") as txtfile:
        file_index = 0
        line_cnt = 0
        outfile = _open_chunk(file_index)
        try:
            for line in txtfile:
                if line_cnt >= lines_per_file:
                    # Current chunk is full: rotate first, then fall through
                    # and write the line, so the line that triggers the
                    # rotation is not dropped.
                    outfile.close()
                    file_index += 1
                    outfile = _open_chunk(file_index)
                    line_cnt = 0
                outfile.write(line)
                line_cnt += 1
        finally:
            outfile.close()

if __name__ == "__main__":
    # Entry point: split every *.txt file found anywhere below the
    # "txtfiles" directory that sits next to this script.
    base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
    root = os.path.join(base_dir, "txtfiles")

    for rootdir, _subdirs, names in os.walk(root):
        # Build the full paths first, then split each one in listing order.
        txtnames = [rootdir + "/" + name for name in names if name.endswith('.txt')]
        for txtname in txtnames:
            split(txtname)

IndexFiles.py

它实现了为指定目录下的txt文件建立索引，并将索引保存到指定的目录，供搜索使用。

#!/usr/bin/env python

INDEX_DIR = “IndexFiles.index”

import sys, os, lucene, threading, time

from datetime import datetime

from java.io import File

from org.apache.lucene.analysis.miscellaneous import LimitTokenCountAnalyzer

from org.apache.lucene.analysis.standard import StandardAnalyzer

from org.apache.lucene.document import Document, Field, FieldType

from org.apache.lucene.index import FieldInfo, IndexWriter, IndexWriterConfig

from org.apache.lucene.store import SimpleFSDirectory

from org.apache.lucene.util import Version

“”"

This class is loosely based on the Lucene (java implementation) demo class

org.apache.lucene.demo.IndexFiles. It will take a directory as an argument

and will index all of the files in that directory and downward recursively.

It will index on the file path, the file name and the file contents. The

resulting Lucene index will be placed next to this script in a directory

called ‘IndexFiles.index’.

“”"

class Ticker(object):
    """Progress indicator that keeps printing dots until it is told to stop.

    ``run`` loops for as long as the ``tick`` attribute is truthy, so another
    thread can end the loop by setting ``tick`` to ``False``.
    """

    def __init__(self):
        # Flag polled by run(); set it to False to end the loop.
        self.tick = True

    def run(self):
        """Write one '.' to stdout per second while ``self.tick`` is truthy."""
        while True:
            if not self.tick:
                break
            sys.stdout.write('.')
            sys.stdout.flush()
            time.sleep(1.0)

class IndexFiles(object):
    """Usage: python IndexFiles

    Builds a Lucene index from every ``*.txt`` file found under a root
    directory.  All the work happens in the constructor: the index is
    created, populated, committed and closed before ``__init__`` returns.
    """

    def __init__(self, root, storeDir, analyzer):
        """Index all ``*.txt`` files below ``root`` into ``storeDir``.

        Args:
            root: Directory that is walked recursively for ``*.txt`` files.
            storeDir: Directory that receives the index; created if missing.
                An existing index there is replaced (open mode ``CREATE``).
            analyzer: Lucene analyzer used for tokenized fields.  It is
                wrapped in a ``LimitTokenCountAnalyzer`` with a limit of 1000.
        """
        if not os.path.exists(storeDir):
            # makedirs also creates missing parent directories.
            os.makedirs(storeDir)

        store = SimpleFSDirectory(File(storeDir))
        analyzer = LimitTokenCountAnalyzer(analyzer, 1000)  # 1048576
        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(store, config)

        ticker = None
        try:
            self.indexDocs(root, writer)
            ticker = Ticker()
            sys.stdout.write('commit index')
            sys.stdout.flush()
            # Print progress dots from a helper thread while committing.
            threading.Thread(target=ticker.run).start()
            writer.commit()
        finally:
            # Always close the writer and stop the ticker thread, even when
            # indexing or committing fails; otherwise the non-daemon ticker
            # thread would keep the process alive printing dots forever.
            try:
                writer.close()
            finally:
                if ticker is not None:
                    ticker.tick = False
        print('done')

    def indexDocs(self, root, writer):
        """Add one document per ``*.txt`` file below ``root`` to ``writer``.

        Each document gets the fields ``name`` (file name), ``path``
        (directory containing the file) and, when the file is not empty,
        ``contents`` (the whole file text).  A failure on a single file is
        reported on stdout and does not stop the walk.

        Args:
            root: Directory that is walked recursively.
            writer: Open Lucene ``IndexWriter`` that receives the documents.
        """
        # Metadata fields: indexed and stored as a single untokenized term.
        t1 = FieldType()
        t1.setIndexed(True)
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

        # Content field: stored, run through the analyzer, and indexed with
        # positions and offsets.
        t2 = FieldType()
        t2.setIndexed(True)
        t2.setStored(True)
        t2.setTokenized(True)
        t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)

        # A separate loop variable avoids rebinding the `root` parameter.
        for dirpath, dirnames, filenames in os.walk(root):
            for filename in filenames:
                if not filename.endswith('.txt'):
                    continue
                print("%s %s" % ("adding", filename))
                try:
                    path = os.path.join(dirpath, filename)
                    # `with` closes the file even if read() raises.
                    with open(path) as infile:
                        contents = infile.read()

                    doc = Document()
                    doc.add(Field("name", filename, t1))
                    doc.add(Field("path", dirpath, t1))
                    if contents:
                        doc.add(Field("contents", contents, t2))
                        print("length of content is %d" % (len(contents)))
                    else:
                        print("warning: no content in %s" % filename)
                    writer.addDocument(doc)
                except Exception as e:
                    # Best effort: report the failing file and carry on.
                    print("%s %s" % ("Failed in indexDocs:", e))

if __name__ == '__main__':
    # Entry point: index a directory of text files into INDEX_DIR, which is
    # placed next to this script.
    #
    # The directory to index may be given as the first command-line argument;
    # without one, "./txtfiles" is used.  This replaces the old
    # `len(sys.argv) < 1` guard, which could never be true because sys.argv
    # always holds at least the script name.
    root = sys.argv[1] if len(sys.argv) > 1 else "./txtfiles"

    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    print("%s %s" % ('lucene', lucene.VERSION))

    start = datetime.now()
    try:
        base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
        print(base_dir)
        print(os.path.abspath(sys.argv[0]))
        IndexFiles(root, os.path.join(base_dir, INDEX_DIR),
                   StandardAnalyzer(Version.LUCENE_CURRENT))
        end = datetime.now()
        # Elapsed time for the whole indexing run.
        print(end - start)
    except Exception as e:
        print("%s %s" % ("Failed: ", e))
        # Bare raise keeps the original traceback.
        raise

SearchFile.py

它就是搜索前面生成的索引，输出搜索结果。

#!/usr/bin/env python

INDEX_DIR = “IndexFiles.index”

import sys, os, lucene

from java.io import File

from org.apache.lucene.analysis.standard import StandardAnalyzer

from org.apache.lucene.index import DirectoryReader

from org.apache.lucene.index import Term

from org.apache.lucene.queryparser.classic import QueryParser

from org.apache.lucene.store import SimpleFSDirectory

from org.apache.lucene.search import IndexSearcher

from org.apache.lucene.search import Query, TermQuery

from org.apache.lucene.util import Version

“”"

This script is loosely based on the Lucene (java implementation) demo class

org.apache.lucene.demo.SearchFiles. It will prompt for a search query, then it

will search the Lucene index directory ‘IndexFiles.index’ next to this script for the

search query entered against the ‘contents’ field. It will then display the

‘path’ and ‘name’ fields for each of the hits it finds in the index. Note that

search.close() is currently commented out because it causes a stack overflow in

some cases.

“”"

def run(searcher, analyzer):
    """Interactive search loop over the ``contents`` field.

    Prompts for a query, searches for up to 10000 hits and prints, for each
    hit, the stored ``path`` and ``name`` fields and the score.  The loop ends
    on an empty input line or on end-of-file.

    The input is treated as plain text: query-syntax characters are escaped
    and the text is then run through ``analyzer`` by the query parser.
    Building a ``TermQuery`` from the raw input would look for the input as a
    single unanalyzed term, which does not line up with a field that was
    tokenized by an analyzer at index time.

    Args:
        searcher: Lucene ``IndexSearcher`` opened on the index.
        analyzer: Analyzer used to parse the query text; presumably the same
            kind of analyzer the index was built with (verify against the
            indexing script).
    """
    parser = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer)

    while True:
        print("Hit enter with no input to quit.")
        try:
            command = raw_input("Query:")
        except EOFError:
            # Ctrl-D or exhausted piped input: leave quietly.
            print("")
            return
        if command == '':
            return

        print("%s %s" % ("Searching for:", command))

        try:
            query = parser.parse(QueryParser.escape(command))
        except lucene.JavaError as e:
            # Parser errors surface as JavaError; report and ask again.
            print("%s %s" % ("Invalid query:", e))
            continue

        hits = searcher.search(query, 10000)
        print("%s total matching documents." % hits.totalHits)
        print("%s %s" % ("Max score:", hits.getMaxScore()))

        for hit in hits.scoreDocs:
            doc = searcher.doc(hit.doc)
            print("%s %s" % ('URI:', doc.getField("path").stringValue()))
            print("%s %s" % ('File:', doc.getField('name').stringValue()))
            print("%s %s" % ('Health:', hit.score))

if __name__ == '__main__':
    # Entry point: start the JVM, open the index stored next to this script
    # and hand control to the interactive search loop.
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    print("%s %s" % ('lucene', lucene.VERSION))

    base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
    index_path = os.path.join(base_dir, INDEX_DIR)
    directory = SimpleFSDirectory(File(index_path))

    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    run(searcher, analyzer)

    # Drop the searcher reference once the loop has ended.
    del searcher

下面是建立索引部分结果输出：

下面是搜索的部分结果输出：

欢迎加入我爱机器学习QQ14群：336582044

微信扫一扫，关注我爱机器学习公众号

您可能感兴趣的与本文相关的镜像

Python3.8

Conda

Python

Python 是一种高级、解释型、通用的编程语言，以其简洁易读的语法而闻名，适用于广泛的应用，包括Web开发、数据分析、人工智能和自动化脚本

显存	CPU	内存	系统盘	数据盘
24GB	10核心	120GB	50GB	40GB