Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PM2.5 #2

Open
wants to merge 26 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions PM2.5 陈小松 .md
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
项目流程:
1.设计模型:
先整理数据集,绘制散点图,因为建立的是Linear Regression模型,结合散点图选择features.
2.定义LOSS函数
设计Loss函数,梯度下降
总结:
在项目进行中,发现自己python掌握很差,应当多学习学习
对LOSS函数的建立,及梯度下降还是不太懂,应加强学习。
49 changes: 49 additions & 0 deletions PM2.5 陈小松.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the raw observation table; rows whose class is PM2.5 hold the target series.
data = pd.read_csv(r'C:\Users\18801\Documents\WeChat Files\wxid_8vs9i3nhli4u22\FileStorage\File\2019-05\2017fall-ml-hw1-master\train.csv')

# `.ix` was deprecated and removed from pandas -- use positional `.iloc`.
# The first 3 columns are metadata; the rest are hourly readings.
pm2_5 = data[data['class'] == 'PM2.5'].iloc[:, 3:]

# Sliding window over each day-row: hours i..i+8 are the features and
# hour i+9 is the target, yielding 15 (x, y) windows per row.
tempxlist = []
tempylist = []
for i in range(15):
    tempx = pm2_5.iloc[:, i:i + 9]
    tempx.columns = np.array(range(9))  # align headers so concat stacks rows
    tempy = pm2_5.iloc[:, i + 9]        # a Series; the original set `.columns`
                                        # on it, which is meaningless for a
                                        # Series and warns in modern pandas
    tempxlist.append(tempx)
    tempylist.append(tempy)
xdata = pd.concat(tempxlist)
x = np.array(xdata, float)
ydata = pd.concat(tempylist)
y = np.array(ydata, float)

# Prepend a bias column of ones.
x = np.concatenate((np.ones((x.shape[0], 1)), x), axis=1)

# Adagrad-style gradient descent on the squared-error loss.
w = np.zeros((len(x[0])))
lr = 10
iteration = 10000
s_grad = np.zeros(len(x[0]))
for i in range(iteration):
    tem = np.dot(x, w)
    loss = y - tem                          # residuals
    grad = np.dot(x.transpose(), loss) * (-2)
    s_grad += grad ** 2
    # Epsilon guards against division by zero when an accumulated
    # gradient component is exactly zero on the first step.
    ada = np.sqrt(s_grad) + 1e-8
    w = w - lr * grad / ada

# Predict on the test split (only 2 metadata columns here).
testdata = pd.read_csv(r'C:\Users\18801\Documents\WeChat Files\wxid_8vs9i3nhli4u22\FileStorage\File\2019-05\2017fall-ml-hw1-master\test.csv')
pm2_5_test = testdata[testdata['class'] == 'PM2.5'].iloc[:, 2:]
x_test = np.array(pm2_5_test, float)
x_test_b = np.concatenate((np.ones((x_test.shape[0], 1)), x_test), axis=1)
y_star = np.dot(x_test_b, w)
y_pre = pd.read_csv(r'C:\Users\18801\Documents\WeChat Files\wxid_8vs9i3nhli4u22\FileStorage\File\2019-05\2017fall-ml-hw1-master\sampleSubmission.csv', encoding='gbk')
y_pre['value'] = y_star  # bracket assignment reliably sets the column
                         # (attribute assignment may only set an attribute)

# Mean absolute error against the ground truth.
real = pd.read_csv(r'C:\Users\18801\Documents\WeChat Files\wxid_8vs9i3nhli4u22\FileStorage\File\2019-05\2017fall-ml-hw1-master\ans.csv')
erro = abs(y_pre.value - real.value).sum() / len(real.value)
print(erro)

# 输出: 4.97442948413  (recorded run result; originally a bare line that broke parsing)
179 changes: 179 additions & 0 deletions PM2.5_ 宋文宇.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
import pandas as pd
import matplotlib.pyplot as plt
from random import randint
from numba import jit

# @jit
def test():
    """Run the whole PM2.5 homework pipeline.

    Steps: load and clean train.csv, build sliding 9-hour-window samples,
    scatter-plot four candidate features against the PM2.5 target, run
    mini-batch gradient descent twice (the original script pasted the
    training loop verbatim twice), then score a fixed, pre-trained
    27-weight linear model on test.csv against answer.csv and print the
    mean squared error.
    """
    # ---- load & clean training data ----
    data = pd.read_csv('train.csv')
    del data['datetime']
    del data['stations']
    del data['observations']
    # NOTE(review): DataFrame.drop returns a new frame, so this line was a
    # no-op in the original code; kept unassigned to preserve behavior --
    # confirm whether row 0 was actually meant to be removed.
    data.drop([0])
    data = data.replace('NR', 0)  # 'NR' (no rain) cells become 0

    # ---- build samples: hours h..h+8 are features, PM2.5 at h+9 is target ----
    item_num = 18                 # observed items (rows) per day
    x_train = []                  # each entry: 18-row x 9-hour DataFrame
    y_train = []                  # PM2.5 value just after each window
    for d in range(int(len(data) / item_num)):
        day = data[d * item_num:(d + 1) * item_num]   # one day's observations
        for h in range(15):
            x_train.append(day.iloc[:, h:h + 9])
            y_train.append(int(day.iloc[9, h + 9]))   # row 9 = PM2.5

    # ---- scatter plots: window-average of four items vs. the target ----
    plot_items = ('AMB_TEMP', 'CH4', 'CO', 'NMHC')    # rows 0..3 of each window
    averages = {name: [] for name in plot_items}
    targets = []
    for sample, target in zip(x_train, y_train):
        targets.append(target)
        for row, name in enumerate(plot_items):
            averages[name].append(
                sum(float(sample.iloc[row, h]) for h in range(9)) / 9)
    plt.figure(figsize=(10, 6))
    for pos, name in enumerate(plot_items, start=1):
        plt.subplot(2, 2, pos)
        plt.title(name)
        plt.scatter(averages[name], targets)
    plt.show()

    # Rows feeding the 27-weight model (presumably PM10, PM2.5 and SO2 --
    # TODO confirm against the item order in train.csv).  Weight k maps to
    # cell (feature_rows[k // 9], k % 9), replacing the original literal
    # dict {0: 8, ..., 26: 12} that also shadowed the builtin `dict`.
    feature_rows = (8, 9, 12)

    def predict(bias, weights, frame):
        """Bias plus the weighted sum of the 27 selected cells of `frame`."""
        total = bias
        for k in range(27):
            total += weights[k] * float(frame.iloc[feature_rows[k // 9], k % 9])
        return total

    def run_minibatch_gd(marker):
        """One mini-batch gradient-descent run.

        The original script duplicated this whole section; `marker`
        reproduces its per-iteration progress print ("标记1"/"标记2").
        Returns (bias, weights, loss_history); the caller ignores them,
        as the original did.
        """
        iteration_count = 100         # epochs
        learning_rate = 0.000001
        bias = 0.0001                 # initial offset term
        weights = [0.001] * 27        # initial 27 parameters
        loss_history = []
        for it in range(iteration_count):
            print(marker, it, "\n")
            loss = 0
            b_grad = 0
            w_grad = [0] * 27
            # 100 random samples (with replacement), as in the original
            batch = [randint(0, len(x_train) - 1) for _ in range(100)]
            for idx in batch:
                frame = x_train[idx]
                residual = predict(bias, weights, frame) - y_train[idx]
                loss += residual * residual
                b_grad += residual
                for k in range(27):
                    w_grad[k] += residual * float(
                        frame.iloc[feature_rows[k // 9], k % 9])
            loss_history.append(loss / 2)
            bias -= learning_rate * b_grad
            for k in range(27):
                weights[k] -= learning_rate * w_grad[k]
        return bias, weights, loss_history

    # The original ran the identical training loop twice with different
    # progress markers; both runs are kept.  Their results are unused --
    # evaluation below relies on the hard-coded pre-trained parameters.
    run_minibatch_gd("标记1")
    run_minibatch_gd("标记2")

    # ---- evaluate the fixed pre-trained model on the test set ----
    data1 = pd.read_csv('test.csv')
    del data1['id']
    del data1['item']
    x_test = [data1[d * item_num:(d + 1) * item_num]
              for d in range(int(len(data1) / item_num))]
    answers = pd.read_csv('answer.csv')
    y_test = [answers.iloc[i, 1] for i in range(len(answers))]

    # Pre-trained parameters reproduced verbatim from the original.
    bias = 0.00371301266193
    weights = [-0.0024696993501677625, 0.0042664323568029619, -0.0086174899917209787, -0.017547874680980298, -0.01836289806786489, -0.0046459546176775678, -0.031425910733080147, 0.018037490234208024, 0.17448898242705385, 0.037982590870111861, 0.025666115101346722, 0.02295437149703404, 0.014272058968395849, 0.011573452230087483, 0.010984971346586308, -0.0061003639742210781, 0.19310213021199321, 0.45973205224805752, -0.0034995637680653086, 0.00094072189075279807, 0.00069329550591916357, 0.002966257320079194, 0.0050690506276038138, 0.007559004246038563, 0.013296350700555241, 0.027251049329127801, 0.039423988570899793]
    y_predict = [predict(bias, weights, frame) for frame in x_test]

    def dev_degree(y_true, y_pred):
        """Mean squared deviation between truth and prediction."""
        total = 0
        for a, b in zip(y_true, y_pred):
            total += (a - b) * (a - b)
        return total / len(y_pred)

    print(dev_degree(y_test, y_predict))

def main():
    """Script entry point: run the full homework pipeline."""
    test()


if __name__ == '__main__':
    main()
15 changes: 15 additions & 0 deletions PM2.5_于金泽.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# 实现思路
- 读取文件,获取PM2.5的数据
- 9个小时为一组进行整合,存储在列表中
- 对x添加一列1起偏置作用
- 循环更新w参数
- 读取测试文件,计算预测值和真实值的偏差,计算平均误差

# 学习内容
- 实现了辛普森一家的分类,并且实现了处理视频
- 学习了LSTM和RNN的实现原理

# 学习预期
- 实现至少5个kaggle项目,使用pytorch实现
- 制作一个简易的chatbot
- 制作一个爬取公众号的scrapy项目,尽量实现界面化
48 changes: 48 additions & 0 deletions PM2.5_于金泽.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the raw observation table; rows whose class is PM2.5 hold the target series.
data = pd.read_csv('train.csv', encoding='gbk')

# `.ix` was deprecated and removed from pandas -- use positional `.iloc`.
# The first 3 columns are metadata; the rest are hourly readings.
pm2_5 = data[data['class'] == 'PM2.5'].iloc[:, 3:]

# Sliding window over each day-row: hours i..i+8 are the features and
# hour i+9 is the target, yielding 15 (x, y) windows per row.
tempxlist = []
tempylist = []
for i in range(15):
    tempx = pm2_5.iloc[:, i:i + 9]
    tempx.columns = np.array(range(9))  # align headers so concat stacks rows
    tempy = pm2_5.iloc[:, i + 9]        # a Series; the original set `.columns`
                                        # on it, which is meaningless for a
                                        # Series and warns in modern pandas
    tempxlist.append(tempx)
    tempylist.append(tempy)
xdata = pd.concat(tempxlist)
x = np.array(xdata, float)
ydata = pd.concat(tempylist)
y = np.array(ydata, float)

# Prepend a bias column of ones.
x = np.concatenate((np.ones((x.shape[0], 1)), x), axis=1)

# Adagrad-style gradient descent on the squared-error loss.
w = np.zeros((len(x[0])))
lr = 10
iteration = 10000
s_grad = np.zeros(len(x[0]))
for i in range(iteration):
    tem = np.dot(x, w)
    loss = y - tem                          # residuals
    grad = np.dot(x.transpose(), loss) * (-2)
    s_grad += grad ** 2
    # Epsilon guards against division by zero when an accumulated
    # gradient component is exactly zero on the first step.
    ada = np.sqrt(s_grad) + 1e-8
    w = w - lr * grad / ada

# Predict on the test split (only 2 metadata columns here).
testdata = pd.read_csv('test.csv', encoding='gbk')
pm2_5_test = testdata[testdata['class'] == 'PM2.5'].iloc[:, 2:]
x_test = np.array(pm2_5_test, float)
x_test_b = np.concatenate((np.ones((x_test.shape[0], 1)), x_test), axis=1)
y_star = np.dot(x_test_b, w)
y_pre = pd.read_csv('sampleSubmission.csv', encoding='gbk')
y_pre['value'] = y_star  # bracket assignment reliably sets the column
                         # (attribute assignment may only set an attribute)

# Mean absolute error against the ground truth.
real = pd.read_csv('ans.csv', encoding='gbk')
erro = abs(y_pre.value - real.value).sum() / len(real.value)
print(erro)

41 changes: 41 additions & 0 deletions PM2.5_吴越.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# PM 2.5数据分析及预测及近期学习报告#

## 已完成 ##


- 对文章中关于PM2.5的分析预测思路有了大致的了解,并通过对csv文件的预处理,成功运行实现了代码,但对于核心梯度下降的代码部分仍有遗留问题



- 文章通过对18项空气质量指标,通过绘制各个空气指标前9小时与待预测PM2.5值的散点图,从中筛选出关键特征(Features),至于散点图的筛选标准,我个人觉得为该散点图的正相关程度,作者最终选择了SO2,PM 10以及PM 2.5作为三个关键特征带入后续计算,而这三者图像均倾向于正相关趋势



- 三个特征项,每一项前9个小时的数据,因而有27个weight权重值进入梯度下降计算中
![](https://i.imgur.com/XI2Iq7W.png)

## 存疑(代码运行中存在问题) ##


- 博客作者最后评价体系给出了他所得到的最佳的27个weight权值,而我通过运算得到的Loss值为73.65,与博客最终提供的45.68有较大的出入



- 尝试自己迭代进行小批量梯度下降计算,得到的权值十分巨大,迭代部分核心代码有较大问题,仍在思考解决中(按照源码迭代出的权值,超级大,以至于最后的Loss值为Inf,不得不说,真是惊了)

## 学习进展 ##


- 看了两集李宏毅的学习教程,在宝可梦的谜一样世界里梦游,不得不说,这样的老师讲课,学生不会瞌睡倒是真的



- 通过Mooc入门学习了TensorFlow框架,目前学至神经网络的正则化,TensorFlow函数实在千姿百态,刚入门,实在记不太住具体用法和使用情景



- 完成辛普森一家角色人物分拣,第一次使用热门的数据平台Kaggle,用的很蹩脚,希望以后多多使用,增加熟悉



5/13/2019 12:22:49 AM
Loading