# generate_synthetic_housing_data.py
# Forked from aws/amazon-sagemaker-examples.
from random import choice
import numpy as np
import pandas as pd
# Row count per location (name-implied; the code that consumes this constant
# is not part of the definitions shown below -- verify against the caller).
NUM_HOUSES_PER_LOCATION = 1000

# City/state labels, written as "<City>_<STATE>" with no spaces.
LOCATIONS = [
    "NewYork_NY",
    "LosAngeles_CA",
    "Chicago_IL",
    "Houston_TX",
    "Dallas_TX",
    "Phoenix_AZ",
    "Philadelphia_PA",
    "SanAntonio_TX",
    "SanDiego_CA",
    "SanFrancisco_CA",
]

# Latest allowed YEAR_BUILT; also the reference year from which a house's age
# is measured when its price is computed.
MAX_YEAR = 2019
def generate_price(house):
    """Compute the synthetic price of a house from its features.

    The price is a fixed linear combination of the features, minus a
    per-year penalty for the age of the house relative to ``MAX_YEAR``.

    Args:
        house: Mapping with the keys ``SQUARE_FEET``, ``NUM_BEDROOMS``,
            ``NUM_BATHROOMS``, ``LOT_ACRES``, ``GARAGE_SPACES``,
            ``YEAR_BUILT`` and ``FRONT_PORCH``. ``FRONT_PORCH`` may be
            encoded as ``1``/``0`` or as the legacy ``"y"``/``"n"``.

    Returns:
        int: The computed price, truncated toward zero by ``int()``.
    """
    # generate_yes_no() in this file yields 1/0, so comparing only against
    # "y" never matched and the porch bonus was silently dropped. Accept both
    # encodings; 1 is listed first so numeric values are matched before the
    # string comparison is attempted.
    front_porch = 1 if house["FRONT_PORCH"] in (1, "y") else 0

    # The garage indicator used to be a copy of the FRONT_PORCH test. It is
    # now derived from GARAGE_SPACES (a key this function already requires):
    # any house with at least one space gets the flat garage bonus on top of
    # the per-space amount below.
    garage = 1 if house["GARAGE_SPACES"] > 0 else 0

    price = int(
        150 * house["SQUARE_FEET"]
        + 10000 * house["NUM_BEDROOMS"]
        + 15000 * house["NUM_BATHROOMS"]
        + 15000 * house["LOT_ACRES"]
        + 10000 * garage
        + 10000 * front_porch
        + 15000 * house["GARAGE_SPACES"]
        # Each year of age (relative to MAX_YEAR) lowers the price.
        - 5000 * (MAX_YEAR - house["YEAR_BUILT"])
    )
    return price
def generate_yes_no():
    """Pick a random binary flag for a yes/no feature.

    Returns:
        int: ``1`` (yes) or ``0`` (no), chosen by ``random.choice``.
    """
    # Candidate order (1, 0) is kept so seeded runs pick the same values.
    return choice((1, 0))
def generate_random_house():
    """Create one randomly generated house as a flat row of values.

    Returns:
        list: ``[YEAR_BUILT, SQUARE_FEET, NUM_BEDROOMS, NUM_BATHROOMS,
        LOT_ACRES, GARAGE_SPACES, FRONT_PORCH, DECK, price]`` where
        ``price`` is the result of ``generate_price`` for those features.
    """
    # The draws below are made in a fixed order so that a seeded random
    # generator yields the same sequence of houses.
    square_feet = np.random.normal(3000, 750)
    # randint's upper bound is exclusive: 2..6 bedrooms.
    bedrooms = np.random.randint(2, 7)
    # Halving an integer in 2..6 gives 1.0..3.0 bathrooms in 0.5 steps.
    bathrooms = np.random.randint(2, 7) / 2
    lot_acres = round(np.random.normal(1.0, 0.25), 2)
    # 0..3 garage spaces.
    garage_spaces = np.random.randint(0, 4)
    # Truncate the sampled year to an int and cap it at MAX_YEAR.
    year_built = min(MAX_YEAR, int(np.random.normal(1995, 10)))

    features = {
        "SQUARE_FEET": square_feet,
        "NUM_BEDROOMS": bedrooms,
        "NUM_BATHROOMS": bathrooms,
        "LOT_ACRES": lot_acres,
        "GARAGE_SPACES": garage_spaces,
        "YEAR_BUILT": year_built,
    }
    features["FRONT_PORCH"] = generate_yes_no()
    features["DECK"] = generate_yes_no()

    # Output column order differs from the order in which values were drawn.
    output_order = (
        "YEAR_BUILT",
        "SQUARE_FEET",
        "NUM_BEDROOMS",
        "NUM_BATHROOMS",
        "LOT_ACRES",
        "GARAGE_SPACES",
        "FRONT_PORCH",
        "DECK",
    )
    row = [features[key] for key in output_order]
    row.append(generate_price(features))
    return row
def generate_houses(num_houses):
    """Build a DataFrame of randomly generated houses.

    Args:
        num_houses: Number of rows (houses) to generate.

    Returns:
        pandas.DataFrame: One row per house, with the feature columns
        followed by ``PRICE``.
    """
    column_names = [
        "YEAR_BUILT",
        "SQUARE_FEET",
        "NUM_BEDROOMS",
        "NUM_BATHROOMS",
        "LOT_ACRES",
        "GARAGE_SPACES",
        "FRONT_PORCH",
        "DECK",
        "PRICE",
    ]
    # Each call returns one row whose values follow column_names' order.
    rows = [generate_random_house() for _ in range(num_houses)]
    return pd.DataFrame(rows, columns=column_names)