当先锋百科网

首页 1 2 3 4 5 6 7

直接上代码吧

# -*- coding: utf-8 -*-

from elasticsearch import Elasticsearch


# 日志的配置环境
import platform
import logging.handlers
sys_platform = platform.system()
# On Windows the log goes to a file in the current working directory; on
# every other platform it goes to a file at the filesystem root.
LOG_FILE_check = './app_cic.txt' if sys_platform == "Windows" else '/cic1.log'

# Records are written as a newline followed by the bare message: no
# timestamp, level or logger name is included.
fmt = '\n' + '%(message)s'
formatter = logging.Formatter(fmt)

# Rotate once a file reaches 1200 MiB and keep at most ten rotated backups.
# NOTE(review): an earlier comment described this limit as "200M"; the value
# in effect is 1200 * 1024 * 1024 bytes. Confirm which was intended.
handler = logging.handlers.RotatingFileHandler(
    LOG_FILE_check,
    maxBytes=1200 * 1024 * 1024,
    backupCount=10,
)
handler.setFormatter(formatter)

# Dedicated logger named 'check'; DEBUG lets every record through to the handler.
logger = logging.getLogger('check')
logger.setLevel(logging.DEBUG)
logger.addHandler(handler)


# Client for the cluster reachable through the seed node 20.0.0.11:9200.
# Node sniffing is enabled at startup and after a connection failure, with
# sniff_timeout set to 60.
es = Elasticsearch(
    hosts=["20.0.0.11:9200"],
    sniff_on_start=True,
    sniff_on_connection_fail=True,
    sniff_timeout=60,
)

import time

# Search body: a "terms" query on the "site" field with a single accepted
# value.
_site_values = ["百度搜索"]
query_json = {"query": {"terms": {"site": _site_values}}}



# Export every hit matching query_json as one "<other6> <source>" line,
# deduplicated, through the 'check' logger.
page_num = 1000  # number of hits requested per scroll page
# Opening the scroll already returns the first page of hits together with
# the cursor used to fetch the following pages.
query = es.search(index='guoyan_index_v1', body=query_json, scroll='5m', size=page_num)
results = query['hits']['hits']  # first page of hits
total = query['hits']['total']  # total hit count as reported by the server
scroll_id = query['_scroll_id']  # cursor for the remaining pages
# Elasticsearch 7+ reports the total as {"value": ..., "relation": ...}
# rather than a bare integer; accept both shapes.
total_count = total['value'] if isinstance(total, dict) else total
every_num = total_count // page_num  # number of full pages; informational only
# print(results)
print("total",total)
print("scroll_id",scroll_id)
print("every_num",every_num)

alist = []  # not used below; kept in case code outside this section refers to it
end_data_list = []
skipped = 0  # hits that could not be turned into an output line
print("----------", every_num + 1)

page = results
page_index = 0
try:
    # Start with the page returned by the initial search (it was previously
    # dropped) and keep scrolling until the server returns an empty page,
    # instead of trusting a page count derived from the reported total.
    while page:
        print("正在读取的位置是:",page_index)
        for hit in page:
            try:
                record = hit['_source']
                end_data_list.append(record["other6"] + " " + record["source"])
            except (KeyError, TypeError):
                # Best effort: a hit missing either field, or holding a
                # non-string value, is skipped rather than aborting the export.
                skipped += 1
        response = es.scroll(scroll_id=scroll_id, scroll='5m')
        # Always continue from the most recently returned cursor.
        scroll_id = response['_scroll_id']
        page = response['hits']['hits']
        page_index += 1
finally:
    # Release the server-side scroll context instead of letting it linger
    # until the 5 minute keep-alive expires.
    es.clear_scroll(scroll_id=scroll_id)

print("skipped hits without usable source/other6:", skipped)
# dict.fromkeys removes duplicates while keeping first-seen order, so the
# output order is reproducible between runs.
end_data_list = list(dict.fromkeys(end_data_list))
print("去重以后的数据是条数是:",len(end_data_list))
for end_data in end_data_list:
    logger.info(end_data)