-
Notifications
You must be signed in to change notification settings - Fork 0
/
avicaching_data.py
460 lines (417 loc) · 17.7 KB
/
avicaching_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
#!/usr/bin/env python
# =============================================================================
# avicaching_data.py
# Author: Anmol Kabra -- github: @anmolkabra
# Project: Solving the Avicaching Game Faster and Better (Summer 2017)
# -----------------------------------------------------------------------------
# Purpose of the Script:
# Reads, writes, mutates and operates on avicaching data files. This module
# is imported in all models for reading and doing things.
# -----------------------------------------------------------------------------
# What do terms mean:
# X - visit densities before placing rewards (for Identification Problem)
# Y - visit densities after placing rewards (for Identification Problem)
# R - rewards matrix
# J - no. of locations
# T - no. of time units
# -----------------------------------------------------------------------------
# Required Dependencies/Software:
# - Python 2.x (obviously, Anaconda environment used originally)
# - NumPy
# -----------------------------------------------------------------------------
# Required Local Files/Data/Modules:
# - ./data/*
# =============================================================================
from __future__ import print_function
import numpy as np
def read_XYR_file(file_name, locs, T):
    """
    Read the datafile containing X, Y, R information.

    The file stores rows in repeating groups of three -- an X row, a Y row,
    then an R row -- one group per time unit.

    Args:
        file_name -- (str) name of the file
        locs -- (int) J; only the first J columns of each row are kept
        T -- (int) T; at most T row-groups are read

    Returns:
        3-tuple -- (tuple of NumPy ndarrays) X, Y, R. Each is 1-d when only
        one row was read for it, 2-d (rows x locs) otherwise.
    """
    # Collect rows in lists and stack once at the end: avoids the O(T^2)
    # cost of repeatedly calling np.vstack per row.
    x_rows, y_rows, r_rows = [], [], []
    targets = (x_rows, y_rows, r_rows)  # idx % 3 -> X, Y, R
    with open(file_name, "r") as xyrfile:
        for idx, line in zip(range(T * 3), xyrfile):
            # each line contains locs+ floats separated by spaces; keep J cols.
            # A list comprehension (not bare map()) keeps this correct on
            # both Python 2 and 3 -- map() is lazy on Python 3 and
            # np.array(map(...)) would produce a 0-d object array there.
            line_vec = np.array([float(v) for v in line.split()[:locs]])
            targets[idx % 3].append(line_vec)

    def _stack(rows):
        # mirror the original incremental-vstack result exactly:
        # [] when nothing read, 1-d for a single row, 2-d otherwise
        if not rows:
            return []
        if len(rows) == 1:
            return rows[0]
        return np.vstack(rows)

    return (_stack(x_rows), _stack(y_rows), _stack(r_rows))
def read_F_file(file_name, locs):
    """
    Reads the csv file containing f information.

    The first (header) row is skipped; at most `locs` data rows are read and
    the last 3 columns of each row (lat/long/id info) are dropped.

    Args:
        file_name -- (str) name of the file
        locs -- (int) J, max no. of rows to read

    Returns:
        NumPy ndarray -- f (1-d for a single row, 2-d otherwise)
    """
    rows = []
    with open(file_name, "r") as fufile:
        next(fufile)  # skip header row of fufile
        for _, line in zip(range(locs), fufile):
            # ignore last 3 cols which contain lat long information.
            # list comprehension keeps this Python 2/3 compatible (map() is
            # lazy on Python 3 and would break np.array here)
            rows.append(np.array([float(v) for v in line.split(",")[:-3]]))
    # stack once (O(n)) instead of vstack per row (O(n^2)); preserve the
    # original's shapes: [] if empty, 1-d for one row, 2-d otherwise
    if not rows:
        return []
    if len(rows) == 1:
        return rows[0]
    return np.vstack(rows)
def read_dist_file(file_name, locs):
    """
    Reads the DIST file containing the distances between all locations.

    Only the first `locs` rows and first `locs` columns are kept.

    Args:
        file_name -- (str) name of the file
        locs -- (int) J

    Returns:
        NumPy ndarray -- DIST (1-d for a single row, 2-d otherwise)
    """
    rows = []
    with open(file_name, "r") as distfile:
        for _, line in zip(range(locs), distfile):
            # only read J rows and cols; slicing before float() also skips
            # converting columns we would discard anyway. The comprehension
            # (not bare map()) keeps this Python 2/3 compatible.
            rows.append(np.array([float(v) for v in line.split()[:locs]]))
    # stack once (O(n)) instead of per-row vstack (O(n^2)); shapes match the
    # original: [] if empty, 1-d for one row, 2-d otherwise
    if not rows:
        return []
    if len(rows) == 1:
        return rows[0]
    return np.vstack(rows)
def combine_DIST_F(f, DIST, locs, numFeatures):
    """
    Combines f and DIST as data preprocessing.

    For each location pair (v, u), the feature vector is the distance from
    v to u followed by the location features of u.

    Args:
        f -- (NumPy ndarray) f, shape (locs, numFeatures - 1)
        DIST -- (NumPy ndarray) DIST, at least locs x locs
        locs -- (int) J
        numFeatures -- (int) `len(f[i]) + 1` (accounting for the distance element)

    Returns:
        NumPy ndarray -- shape (locs, locs, numFeatures); the input dataset
        without the rewards.
    """
    NN_in = np.empty([locs, locs, numFeatures], dtype=float)
    # vectorized equivalent of the per-(v, u) loop:
    #   NN_in[v][u][0]  = DIST[v][u]
    #   NN_in[v][u][1:] = f[u]
    NN_in[:, :, 0] = np.asarray(DIST)[:locs, :locs]
    # f[u] is the same for every v, so broadcast f along axis 0
    NN_in[:, :, 1:] = np.asarray(f)[np.newaxis, :locs, :]
    return NN_in
def save_rand_XYR(file_name, X, Y, R, J=116, T=173):
    """
    Writes X, Y, R information to a file such that it is readable by
    read_XYR_file(). Rows are interspersed in groups of three per time unit:
    X[t], Y[t], R[t].

    The dimensions of X, Y, R must be T x J each.

    Args:
        file_name -- (str) name of the file
        X -- (NumPy ndarray) X
        Y -- (NumPy ndarray) Y
        R -- (NumPy ndarray) R
        J -- (int) no. of locations (default=116)
        T -- (int) no. of time units (default=173)
    """
    # Fill the preallocated buffer with strided assignment. The original
    # discarded the np.empty buffer and rebuilt it with O(T^2) repeated
    # vstack calls; this writes each group of rows in place instead.
    XYR = np.empty([T * 3, J])
    XYR[0::3] = X  # rows 0, 3, 6, ... hold X
    XYR[1::3] = Y  # rows 1, 4, 7, ... hold Y
    XYR[2::3] = R  # rows 2, 5, 8, ... hold R
    np.savetxt(file_name, XYR, fmt="%.8f", delimiter=" ")
def split_along_dim(M, num, dim):
    """
    Split a matrix into two pieces at a given index along an axis.

    (Despite what older comments said, no shuffling happens here -- the
    split is a plain positional cut.)

    Args:
        M -- (NumPy ndarray) matrix to be split
        num -- (int) index to split at
        dim -- (int) axis/dimension for splitting

    Returns:
        list of 2 NumPy ndarrays -- [M[..., :num, ...], M[..., num:, ...]]
        along the chosen axis
    """
    first_part, second_part = np.split(M, [num], axis=dim)
    return [first_part, second_part]
def read_weights_file(file_name, locs_in_file, locs, numFeatures):
    """
    Reads the weights file and splits the saved data into 2 weights tensors,
    as our models require.

    Args:
        file_name -- (str) name of the file
        locs_in_file -- (int) no. of locations used in the weights file
        locs -- (int) no. of locations required by the run specs
            (<= locs_in_file). The function reads the file and ignores data
            corresponding to locations > locs
        numFeatures -- (int) no. of features in the dataset (the run spec
            should not request something different than what is in the file).
            This also determines the size of the weights tensors

    Returns:
        2-tuple of NumPy ndarrays -- w1 and w2
    """
    data = np.loadtxt(file_name)
    # layout of the file: the first len(data) - locs_in_file rows are the 3d
    # w1 flattened into 2d slices; the last locs_in_file rows are w2
    boundary = len(data) - locs_in_file
    w1_flat, w2 = np.split(data, [boundary], axis=0)
    # keep only the requested number of locations. w1 is stored as 2d, so
    # locs locations correspond to locs * numFeatures rows
    w1_flat = w1_flat[:locs * numFeatures]
    w2 = w2[:locs]
    w1 = w1_flat.reshape((locs, numFeatures, numFeatures))
    return (w1, w2)
def read_lat_long_from_Ffile(file_name, locs, lat_col=33, long_col=34):
    """
    Reads the latitude and longitude from the file containing f information.

    Args:
        file_name -- (str) name of the file
        locs -- (int) no. of locations. Also represents the length of lat and
            long vectors
        lat_col -- (int) col no. in the f file (default=33)
        long_col -- (int) col no. in the f file (default=34); assumed to be
            adjacent to lat_col since the slice [lat_col:long_col + 1] is used

    Returns:
        NumPy ndarray -- 2d matrix where the first col are latitudes and the
        second col are longitudes (1-d pair when locs == 1)
    """
    rows = []
    with open(file_name, "r") as fufile:
        next(fufile)  # skip header row of fufile
        for _, line in zip(range(locs), fufile):
            # extract latitude and longitude. Since they are stored adjacent,
            # just ignore other cols. The list comprehension (not bare map())
            # keeps np.array() working on both Python 2 and 3.
            cells = line.split(",")[lat_col:long_col + 1]
            rows.append(np.array([float(v) for v in cells]))
    # stack once (O(n)) rather than vstack per row (O(n^2)); shapes match the
    # original: [] if empty, 1-d for one row, 2-d otherwise
    if not rows:
        return []
    if len(rows) == 1:
        return rows[0]
    return np.vstack(rows)
def normalize(x, along_dim=None, using_max=True, offset_division=0.000001):
    """
    Normalizes a tensor by dividing each element by the maximum or by the sum,
    which are calculated along a dimension.

    Args:
        x -- (NumPy ndarray) matrix/tensor to be normalized
        along_dim -- (int or None) If along_dim is an int, the max/sum is
            calculated along that dimension; if it's None,
            whole x's max/sum is calculated (default=None)
        using_max -- (bool) Normalize using max if True and sum if False
            (default=True)
        offset_division -- (float) safety mechanism to avoid division by zero

    Returns:
        NumPy ndarray -- Normalized matrix/tensor
    """
    # keepdims=True in BOTH branches: the original max branch omitted it,
    # which only broadcasts correctly when along_dim is the last axis
    # (the sum branch already had it). With keepdims the division is
    # correct for any axis, and results for axis=None / last-axis inputs
    # are unchanged.
    reduce_fn = np.amax if using_max else np.sum
    denom = reduce_fn(x, axis=along_dim, keepdims=True)
    return x / (denom + offset_division)
def make_rand_F_file(file_name, J):
"""
[Extremely bad code. A very bad example of coding style]
Creates and write random f file.
Args:
file_name -- (str) name of the file
J -- (int) J
"""
# num visits -- type random int
num_visits = np.floor(np.random.rand(J) * 1000)
# num species -- type random int
num_species = np.floor(np.random.rand(J) * 500)
# NLCD2011_FS_C11_375_PLAND -- type random float
NLCD2011_FS_C11_375_PLAND = np.random.rand(J) * 100
# NLCD2011_FS_C12_375_PLAND -- zeros
NLCD2011_FS_C12_375_PLAND = np.zeros(J)
# NLCD2011_FS_C21_375_PLAND -- type random float
NLCD2011_FS_C21_375_PLAND = np.random.rand(J) * 20
# NLCD2011_FS_C22_375_PLAND -- type random float
NLCD2011_FS_C22_375_PLAND = np.random.rand(J) * 50
# NLCD2011_FS_C23_375_PLAND -- type random float
NLCD2011_FS_C23_375_PLAND = np.random.rand(J) * 50
# NLCD2011_FS_C24_375_PLAND -- type random float
NLCD2011_FS_C24_375_PLAND = np.random.rand(J) * 20
# NLCD2011_FS_C31_375_PLAND -- type random float
NLCD2011_FS_C31_375_PLAND = np.random.rand(J) * 2
# NLCD2011_FS_C41_375_PLAND -- type random float
NLCD2011_FS_C41_375_PLAND = np.random.rand(J) * 100
# NLCD2011_FS_C42_375_PLAND -- type random float
NLCD2011_FS_C42_375_PLAND = np.random.rand(J) * 20
# NLCD2011_FS_C43_375_PLAND -- type random float
NLCD2011_FS_C43_375_PLAND = np.random.rand(J) * 20
# NLCD2011_FS_C52_375_PLAND -- type random float
NLCD2011_FS_C52_375_PLAND = np.random.rand(J) * 20
# NLCD2011_FS_C71_375_PLAND -- type random float
NLCD2011_FS_C71_375_PLAND = np.random.rand(J) * 2
# NLCD2011_FS_C81_375_PLAND -- type random float
NLCD2011_FS_C81_375_PLAND = np.random.rand(J) * 100
# NLCD2011_FS_C82_375_PLAND -- type random float
NLCD2011_FS_C82_375_PLAND = np.random.rand(J) * 80
# NLCD2011_FS_C90_375_PLAND -- type random float
NLCD2011_FS_C90_375_PLAND = np.random.rand(J) * 20
# NLCD2011_FS_C95_375_PLAND -- type random float
NLCD2011_FS_C95_375_PLAND = np.random.rand(J) * 2
# HOUSING_DENSITY -- type random float
HOUSING_DENSITY = np.random.rand(J) * 500
# HOUSING_PERCENT_VACANT -- type random float
HOUSING_PERCENT_VACANT = np.random.rand(J) * 0.1
# ELEV_GT -- type random int
ELEV_GT = np.floor(np.random.rand(J) * 500)
# DIST_FROM_FLOWING_FRESH -- type random int
DIST_FROM_FLOWING_FRESH = np.floor(np.random.rand(J) * 5)
# DIST_IN_FLOWING_FRESH -- type random int
DIST_IN_FLOWING_FRESH = np.floor(np.random.rand(J) * 10)
# DIST_FROM_STANDING_FRESH -- type random int
DIST_FROM_STANDING_FRESH = np.floor(np.random.rand(J) * 10)
# DIST_IN_STANDING_FRESH -- type random int
DIST_IN_STANDING_FRESH = np.floor(np.random.rand(J) * 10)
# DIST_FROM_WET_VEG_FRESH -- type random int
DIST_FROM_WET_VEG_FRESH = np.floor(np.random.rand(J) * 10)
# DIST_IN_WET_VEG_FRESH -- type random int
DIST_IN_WET_VEG_FRESH = np.floor(np.random.rand(J) * 10)
# DIST_FROM_FLOWING_BRACKISH -- type random int
DIST_FROM_FLOWING_BRACKISH = np.floor(np.random.rand(J) * 10)
# DIST_IN_FLOWING_BRACKISH -- type random int
DIST_IN_FLOWING_BRACKISH = np.floor(np.random.rand(J) * 10)
# DIST_FROM_STANDING_BRACKISH -- type random int
DIST_FROM_STANDING_BRACKISH = np.floor(np.random.rand(J) * 10)
# DIST_IN_STANDING_BRACKISH -- type random int
DIST_IN_STANDING_BRACKISH = np.floor(np.random.rand(J) * 10)
# DIST_FROM_WET_VEG_BRACKISH -- type random int
DIST_FROM_WET_VEG_BRACKISH = np.floor(np.random.rand(J) * 10)
# DIST_IN_WET_VEG_BRACKISH -- type random int
DIST_IN_WET_VEG_BRACKISH = np.floor(np.random.rand(J) * 10)
# LATITUDE -- type intersperse between 42 44
LATITUDE = np.linspace(42, 44, num=J)
# LONGITUDE --type intersperse between -75 -77
LONGITUDE = np.linspace(-75, -77, num=J)
# LOC_ID --random
LOC_ID = np.random.rand(J)
###
data = np.vstack([num_visits,
num_species,
NLCD2011_FS_C11_375_PLAND,
NLCD2011_FS_C12_375_PLAND,
NLCD2011_FS_C21_375_PLAND,
NLCD2011_FS_C22_375_PLAND,
NLCD2011_FS_C23_375_PLAND,
NLCD2011_FS_C24_375_PLAND,
NLCD2011_FS_C31_375_PLAND,
NLCD2011_FS_C41_375_PLAND,
NLCD2011_FS_C42_375_PLAND,
NLCD2011_FS_C43_375_PLAND,
NLCD2011_FS_C52_375_PLAND,
NLCD2011_FS_C71_375_PLAND,
NLCD2011_FS_C81_375_PLAND,
NLCD2011_FS_C82_375_PLAND,
NLCD2011_FS_C90_375_PLAND,
NLCD2011_FS_C95_375_PLAND,
HOUSING_DENSITY,
HOUSING_PERCENT_VACANT,
ELEV_GT,
DIST_FROM_FLOWING_FRESH,
DIST_IN_FLOWING_FRESH,
DIST_FROM_STANDING_FRESH,
DIST_IN_STANDING_FRESH,
DIST_FROM_WET_VEG_FRESH,
DIST_IN_WET_VEG_FRESH,
DIST_FROM_FLOWING_BRACKISH,
DIST_IN_FLOWING_BRACKISH,
DIST_FROM_STANDING_BRACKISH,
DIST_IN_STANDING_BRACKISH,
DIST_FROM_WET_VEG_BRACKISH,
DIST_IN_WET_VEG_BRACKISH,
LATITUDE,
LONGITUDE,
LOC_ID])
with open(file_name, "w") as f:
f.write("num visits,num species,NLCD2011_FS_C11_375_PLAND,NLCD2011_FS_C12_375_PLAND,NLCD2011_FS_C21_375_PLAND,NLCD2011_FS_C22_375_PLAND,NLCD2011_FS_C23_375_PLAND,NLCD2011_FS_C24_375_PLAND,NLCD2011_FS_C31_375_PLAND,NLCD2011_FS_C41_375_PLAND,NLCD2011_FS_C42_375_PLAND,NLCD2011_FS_C43_375_PLAND,NLCD2011_FS_C52_375_PLAND,NLCD2011_FS_C71_375_PLAND,NLCD2011_FS_C81_375_PLAND,NLCD2011_FS_C82_375_PLAND,NLCD2011_FS_C90_375_PLAND,NLCD2011_FS_C95_375_PLAND,HOUSING_DENSITY,HOUSING_PERCENT_VACANT,ELEV_GT,DIST_FROM_FLOWING_FRESH,DIST_IN_FLOWING_FRESH,DIST_FROM_STANDING_FRESH,DIST_IN_STANDING_FRESH,DIST_FROM_WET_VEG_FRESH,DIST_IN_WET_VEG_FRESH,DIST_FROM_FLOWING_BRACKISH,DIST_IN_FLOWING_BRACKISH,DIST_FROM_STANDING_BRACKISH,DIST_IN_STANDING_BRACKISH,DIST_FROM_WET_VEG_BRACKISH,DIST_IN_WET_VEG_BRACKISH,LATITUDE,LONGITUDE,LOC_ID\n")
np.savetxt(f, data.T, fmt="%.5f", delimiter=",")
def make_rand_DIST_file(file_name, J):
    """
    Write a random J x J distance-matrix file.

    Entries are uniform floats in [0, 100); the diagonal is zeroed because a
    location is at distance 0 from itself.

    Args:
        file_name -- (str) name of the file
        J -- (int) J, no. of locations
    """
    dist = np.random.rand(J, J) * 100
    np.fill_diagonal(dist, 0.0)  # distance[u][u] = 0
    np.savetxt(file_name, dist, fmt="%.6f", delimiter=" ")
def combine_lp_time_log(outfile, cpu_set, gpu_set, onlylp):
    """
    Combines lp runtime logs of diff configs for tex-tikz input.

    Each input file is csv with the runtime in the second column; one output
    row is written per epoch (i.e. per line common to all three inputs).

    Args:
        outfile -- (str) name of the output file (csv)
        cpu_set -- (str) name of the file with CPU "set" info
        gpu_set -- (str) name of the file with GPU "set" info
        onlylp -- (str) name of the file with 'Only LP' info
    """
    with open(cpu_set, "r") as c, open(gpu_set, "r") as g, open(onlylp) as o,\
            open(outfile, "w") as out:
        out.write("epoch,cpuset,gpuset,onlylp\n")
        # enumerate(..., start=1) replaces the manual epoch counter
        for epoch, triple in enumerate(zip(c, g, o), start=1):
            runtimes = [float(entry.split(",")[1]) for entry in triple]
            out.write("%d,%.6f,%.6f,%.6f\n"
                      % (epoch, runtimes[0], runtimes[1], runtimes[2]))
def combine_lp_time_log_threads(outfile, thread1, thread3, thread5, thread7):
    """
    Combines lp runtime logs of different CPU-access restriction levels
    for tex-tikz input.

    Each input file is csv with the runtime in the second column; one output
    row is written per epoch.

    Args:
        outfile -- (str) name of the output file
        thread1 -- (str) file with info when the script was restricted to
            1 thread only
        thread3 -- (str) file with info when the script was restricted to
            3 threads only
        thread5 -- (str) file with info when the script was restricted to
            5 threads only
        thread7 -- (str) file with info when the script was restricted to
            7 threads only
    """
    with open(thread1, "r") as t1, open(thread3, "r") as t3, open(thread5) as t5,\
            open(thread7, "r") as t7, open(outfile, "w") as out:
        out.write("epoch,t1,t3,t5,t7\n")
        # enumerate(..., start=1) replaces the manual epoch counter
        for epoch, quad in enumerate(zip(t1, t3, t5, t7), start=1):
            times = tuple(float(entry.split(",")[1]) for entry in quad)
            out.write("%d,%.6f,%.6f,%.6f,%.6f\n" % ((epoch,) + times))
def extract_python_processes(outfile, infile):
    """
    Collects CPU and RAM usage of python processes from the logs.

    Lines not mentioning "python" are skipped; matching lines contribute one
    output row each, numbered consecutively as epochs.

    Args:
        outfile -- (str) name of the output file
        infile -- (str) name of the input file, which contains the logs
    """
    with open(infile, "r") as log, open(outfile, "w") as out:
        out.write("epoch,cpu,mem\n")
        epoch = 1
        for record in log:
            # guard clause: only python processes are of interest
            if "python" not in record:
                continue
            # fields are space-separated: pid, mem, cpu, name -- per the
            # original comment. NOTE(review): the header says "cpu,mem" but
            # fields 1 and 2 are written in file order (mem then cpu if the
            # comment is right) -- confirm against an actual log file.
            fields = record.split(" ")
            out.write("%d,%.1f,%.1f\n"
                      % (epoch, float(fields[1]), float(fields[2])))
            epoch += 1