#!/usr/bin/python
# -*- coding: utf-8 -*-
# @Time : 2018/2/5 15:56
# @Author :
# @Site :
# @File : analyze.py
import xlrd
from openpyxl import Workbook


def exceldata(filename, n):
    """
    Open an Excel workbook and return one of its sheets.
    :param filename: Excel file name
    :param n: sheet index (0-based)
    :return: xlrd sheet object, or None if the file cannot be read
    """
    try:
        book = xlrd.open_workbook(filename)
        sheet = book.sheets()[n]
        return sheet
    except Exception as e:
        print(e)
        return None
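

# xlrd 2.x dropped .xlsx support, so exceldata() as written needs xlrd < 2.0. A minimal
# alternative reader using openpyxl (already a dependency here) is sketched below;
# exceldata_openpyxl is only a sketch, not part of the original workflow: the rest of the
# script still uses the xlrd sheet API (sheet.nrows and 0-based sheet.cell(row, col).value),
# so an openpyxl worksheet is not a drop-in replacement.
def exceldata_openpyxl(filename, n):
    """Open an .xlsx workbook with openpyxl and return its n-th worksheet, or None on error."""
    try:
        from openpyxl import load_workbook
        book = load_workbook(filename, read_only=True, data_only=True)
        return book.worksheets[n]
    except Exception as e:
        print(e)
        return None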


def cleandata(sheet, datalist, dictname, splitstate):
    """
    Clean one column of the sheet and write its unique values to a dict file.
    :param sheet: xlrd sheet object
    :param datalist: column index to read
    :param dictname: output dict (txt) file name
    :param splitstate: True if the column values are "|,|"-delimited strings
    :return: None
    """
    data = []
    print("Generating dict file: " + dictname)
    with open(dictname, "w", encoding="utf-8") as F:
        if splitstate:
            print("Processing delimited values")
            for datarow in range(sheet.nrows):
                # skip the header row
                if datarow != 0:
                    col = sheet.cell(datarow, datalist).value.split("|,|")
                    for word in col:
                        # drop empty values and the "暂无" (not available) placeholder
                        if word != "" and word != "暂无":
                            data.append(word)
            print("Removing duplicates")
            data_uniq = sorted(set(data), key=data.index)
            for word in data_uniq:
                F.write(word + "\n")
        else:
            print("Processing plain values")
            for datarow in range(sheet.nrows):
                # skip the header row
                if datarow != 0:
                    col = sheet.cell(datarow, datalist).value
                    if col != "" and col != "暂无":
                        data.append(col)
            print("Removing duplicates")
            data_uniq = sorted(set(data), key=data.index)
            for word in data_uniq:
                F.write(word + "\n")
        print("Done, closing file")
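

# Example of the assumed data shape: with splitstate=True, a shareholder cell such as
# "甲公司|,|乙公司|,|暂无" (hypothetical values) contributes the lines "甲公司" and "乙公司"
# to the dict file; empty values and the "暂无" placeholder are dropped, and
# sorted(set(data), key=data.index) removes duplicates while keeping first-appearance order.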


def analyze(dictfile, sheet, row, outputfile, sheetkeys, headers):
    """
    Analyze relationships between dict entries and sheet rows.
    :param dictfile: dict (txt) file produced by cleandata()
    :param sheet: xlrd sheet object
    :param row: column index to match dict entries against
    :param outputfile: output .xlsx file name
    :param sheetkeys: list of column indexes to collect from matching rows
    :param headers: header row for the output file
    :return: None
    """
    print("Analyzing, please wait...")
    keys = []
    with open(dictfile, "r", encoding="utf-8") as R:
        for dictrow in R.readlines():
            keys.append(dictrow.rstrip("\n"))
    wb = Workbook()
    ws = wb.active
    ws.append(headers)
    for dictkey in keys:
        resultlist = []
        resultstr = ""
        for datarow in range(sheet.nrows):
            # skip the header row
            if datarow != 0:
                col = sheet.cell(datarow, row).value
                if dictkey in col:
                    if len(dictkey) == len(col):
                        # the whole cell matches the key exactly
                        for rowkey in sheetkeys:
                            resultlist.append(sheet.cell(datarow, rowkey).value)
                    else:
                        # the cell holds several "|,|"-delimited values; match each one exactly
                        for word in col.split("|,|"):
                            if dictkey in word and len(dictkey) == len(word):
                                for rowkey in sheetkeys:
                                    resultlist.append(sheet.cell(datarow, rowkey).value)
        resultlist = list(set(resultlist))
        count = len(resultlist)
        for resultone in resultlist:
            resultstr += resultone + ","
        result = [dictkey, count, resultstr]
        ws.append(result)
    print("Done, writing result file...")
    wb.save(filename=outputfile)
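

# Each output row has the form [dict key, number of distinct collected values, "value1,value2,..."],
# where the comma-joined string holds the sheetkeys column values gathered from every row whose
# matched cell equals the key exactly (either the whole cell or one "|,|"-delimited part of it).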


if __name__ == "__main__":
    print("Reading data...")
    sheet = exceldata("result.xlsx", 0)
    # shareholder dict (column 16, "|,|"-delimited)
    cleandata(sheet, 16, "share_dict.txt", True)
    # legal representative dict (column 2)
    cleandata(sheet, 2, "legal_dict.txt", False)
    # outbound investment dict (column 17, "|,|"-delimited)
    cleandata(sheet, 17, "invest_dict.txt", True)
    # company name dict (column 0)
    cleandata(sheet, 0, "cmp_dict.txt", False)
    # analysis
    sheetkeyslist = [0]  # columns to collect from matching rows
    sheetheader = ['法人名称', '拥有公司数量', '参股公司名单']  # legal rep / number of companies / company list
    # args: (dict file, sheet object, sheet column matched against the dict, output file name,
    #        columns to collect for the relation, output header row)
    analyze("legal_dict.txt", sheet, 2, "法人与公司关系表.xlsx", sheetkeyslist, sheetheader)
    sheetkeyslist = [0]
    sheetheader = ['股东名称', '任法人公司数量', '任法人公司名单']  # shareholder / companies where they are legal rep / list
    analyze("share_dict.txt", sheet, 2, "股东为公司法人表.xlsx", sheetkeyslist, sheetheader)
    sheetkeyslist = [0]
    sheetheader = ['股东名称', '投资公司数量', '参股公司名单']  # shareholder / number of invested companies / company list
    analyze("share_dict.txt", sheet, 16, "股东投资关系表.xlsx", sheetkeyslist, sheetheader)