Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make generation of measurements file deterministic #149

Open
wants to merge 8 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion create_measurements.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,4 @@
#


java --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CreateMeasurements $1
java --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CreateMeasurements $1 $2
2 changes: 1 addition & 1 deletion create_measurements2.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,4 @@
#


java --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CreateMeasurements2 $1
java --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CreateMeasurements2 $1 $2
2 changes: 1 addition & 1 deletion create_measurements3.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,4 @@
#


java --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CreateMeasurements3 $1
java --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CreateMeasurements3 $1 $2
476 changes: 24 additions & 452 deletions src/main/java/dev/morling/onebrc/CreateMeasurements.java

Large diffs are not rendered by default.

555 changes: 55 additions & 500 deletions src/main/java/dev/morling/onebrc/CreateMeasurements2.java

Large diffs are not rendered by default.

149 changes: 27 additions & 122 deletions src/main/java/dev/morling/onebrc/CreateMeasurements3.java
Original file line number Diff line number Diff line change
Expand Up @@ -15,152 +15,57 @@
*/
package dev.morling.onebrc;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.StringReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.concurrent.ThreadLocalRandom;

public class CreateMeasurements3 {
import org.rschwietzke.FastRandom;

public static final int MAX_NAME_LEN = 100;
public static final int KEYSET_SIZE = 10_000;
public class CreateMeasurements3 {

public static void main(String[] args) throws Exception {
if (args.length != 1) {
System.out.println("Usage: create_measurements3.sh <number of records to create>");
if (args.length < 1) {
System.out.println("Usage: create_measurements3.sh <number of records to create> [seed]");
System.exit(1);
}

int size = 0;
try {
size = Integer.parseInt(args[0]);
}
catch (NumberFormatException e) {
System.out.println("Invalid value for <number of records to create>");
System.out.println("Usage: create_measurements3.sh <number of records to create>");
System.out.println("Usage: create_measurements3.sh <number of records to create> [seed]");
System.exit(1);
}
final var weatherStations = generateWeatherStations();

// Default seed: the ASCII bytes of "1brc1brc" packed into a long
// (0x31 = '1', 0x62 = 'b', 0x72 = 'r', 0x63 = 'c').
long seed = 0x3162726331627263L;
// Honour the seed whenever a second argument is present. The earlier guard only
// rejects an empty argument list, so an exact "== 2" test would silently fall
// back to the default seed if any extra argument followed.
if (args.length >= 2) {
    try {
        seed = Long.parseLong(args[1]);
    }
    catch (NumberFormatException e) {
        System.out.println("Invalid value for [seed]");
        // Name this program's launcher script, matching the other usage messages in this file.
        System.out.println("Usage: create_measurements3.sh <number of records to create> [seed]");
        System.exit(1);
    }
}

final var weatherStations = WeatherStationFactory.getWeatherStationsList(seed);
final var start = System.currentTimeMillis();
final var rnd = ThreadLocalRandom.current();
final var rnd = new FastRandom(seed);
try (var out = new BufferedWriter(new FileWriter("measurements.txt"))) {
for (int i = 1; i <= size; i++) {
var station = weatherStations.get(rnd.nextInt(KEYSET_SIZE));
double temp = rnd.nextGaussian(station.avgTemp, 7.0);
out.write(station.name);
var station = weatherStations.get(rnd.nextInt(weatherStations.size()));
double temp = station.measurement();
out.write(station.id);
out.write(';');
out.write(Double.toString(Math.round(temp * 10.0) / 10.0));
out.newLine();
out.write(Double.toString(temp));
out.write('\n');
if (i % 50_000_000 == 0) {
System.out.printf("Wrote %,d measurements in %,d ms%n", i, System.currentTimeMillis() - start);
}
}
}
}

/**
 * A synthetic weather station as produced by {@code generateWeatherStations()}.
 *
 * @param name    the generated station name written in front of the ';' of each output row
 * @param avgTemp the station's mean temperature; {@code main} uses it as the mean of the
 *                Gaussian from which individual readings are drawn
 */
record WeatherStation(String name, float avgTemp) {
}

/**
 * Builds the list of synthetic weather stations from {@code data/weather_stations.csv}.
 *
 * <p>The file is read twice. The first pass concatenates the text before the first ';' of
 * every row into one long string. The second pass walks the rows again (at most
 * {@code KEYSET_SIZE} of them) and, for each row, cuts a pseudo-random-length slice out of
 * that string to use as the station name, while the value after the first ';' of the row is
 * parsed as a latitude and turned into a mean temperature.
 *
 * <p>Each name is adjusted so that it does not start or end with whitespace, is unique
 * among the names generated so far, and encodes to at most 100 UTF-8 bytes.
 *
 * @return the generated stations, one per row consumed in the second pass
 * @throws Exception if the concatenated name source runs out of characters, if a generated
 *                   name contains a semicolon, or if reading the file fails
 */
private static ArrayList<WeatherStation> generateWeatherStations() throws Exception {
// Use a public list of city names and concatenate them all into a long string,
// which we'll use as a "source of city name randomness"
var bigName = new StringBuilder(1 << 20);
try (var rows = new BufferedReader(new FileReader("data/weather_stations.csv"));) {
skipComments(rows);
while (true) {
var row = rows.readLine();
if (row == null) {
break;
}
// Keep only the part before the first ';' of the row.
bigName.append(row, 0, row.indexOf(';'));
}
}
final var weatherStations = new ArrayList<WeatherStation>();
// Names handed out so far, used to guarantee uniqueness.
final var names = new HashSet<String>();
// Shortest / longest generated name, measured in UTF-8 bytes (reported at the end).
var minLen = Integer.MAX_VALUE;
var maxLen = Integer.MIN_VALUE;
try (var rows = new BufferedReader(new FileReader("data/weather_stations.csv"))) {
skipComments(rows);
// Sequential cursor over the concatenated names; every read below advances it.
final var nameSource = new StringReader(bigName.toString());
final var buf = new char[MAX_NAME_LEN];
final var rnd = ThreadLocalRandom.current();
// Parameters of the name-length curve: yOffset + factor * (u - xOffset)^power, u in [0, 1).
final double yOffset = 4;
final double factor = 2500;
final double xOffset = 0.372;
final double power = 7;
for (int i = 0; i < KEYSET_SIZE; i++) {
var row = rows.readLine();
if (row == null) {
break;
}
// Use a 7th-order curve to simulate the name length distribution.
// It gives us mostly short names, but with large outliers.
var nameLen = (int) (yOffset + factor * Math.pow(rnd.nextDouble() - xOffset, power));
var count = nameSource.read(buf, 0, nameLen);
if (count == -1) {
throw new Exception("Name source exhausted");
}
// NOTE(review): count is only compared with -1; if the read returns fewer than nameLen
// characters, the append below still copies nameLen characters from buf — confirm
// whether a short read can happen here.
var nameBuf = new StringBuilder(nameLen);
nameBuf.append(buf, 0, nameLen);
// A name must not start or end with whitespace: overwrite such a character with the
// next non-space character taken from the name source.
if (Character.isWhitespace(nameBuf.charAt(0))) {
nameBuf.setCharAt(0, readNonSpace(nameSource));
}
if (Character.isWhitespace(nameBuf.charAt(nameBuf.length() - 1))) {
nameBuf.setCharAt(nameBuf.length() - 1, readNonSpace(nameSource));
}
var name = nameBuf.toString();
// Resolve duplicates by overwriting one randomly chosen position until the name is new.
while (names.contains(name)) {
nameBuf.setCharAt(rnd.nextInt(nameBuf.length()), readNonSpace(nameSource));
name = nameBuf.toString();
}
// Drop trailing characters until the UTF-8 encoding fits into 100 bytes, again
// making sure the name does not end with whitespace after each cut.
int actualLen;
while (true) {
actualLen = name.getBytes(StandardCharsets.UTF_8).length;
if (actualLen <= 100) {
break;
}
nameBuf.deleteCharAt(nameBuf.length() - 1);
if (Character.isWhitespace(nameBuf.charAt(nameBuf.length() - 1))) {
nameBuf.setCharAt(nameBuf.length() - 1, readNonSpace(nameSource));
}
name = nameBuf.toString();
}
// ';' separates name and value in the output file, so it must not occur in a name.
if (name.indexOf(';') != -1) {
throw new Exception("Station name contains a semicolon!");
}
names.add(name);
minLen = Integer.min(minLen, actualLen);
maxLen = Integer.max(maxLen, actualLen);
// Everything after the first ';' of the row is parsed as the latitude.
var lat = Float.parseFloat(row.substring(row.indexOf(';') + 1));
// Guesstimate mean temperature using cosine of latitude
var avgTemp = (float) (30 * Math.cos(Math.toRadians(lat))) - 10;
weatherStations.add(new WeatherStation(name, avgTemp));
}
}
// NOTE(review): this reports KEYSET_SIZE even when the loop stopped early because the file
// had fewer rows; weatherStations.size() would be the actual count.
System.format("Generated %,d station names with length from %,d to %,d%n", KEYSET_SIZE, minLen, maxLen);
return weatherStations;
}

/**
 * Advances {@code rows} past its leading comment lines (lines starting with '#').
 *
 * <p>Reading stops after the first line that does not start with '#'. That line has already
 * been consumed when this method returns, so the caller continues with the line after it.
 * If the input ends before such a line is found, the method returns normally with the
 * reader at end of input; previously this case failed with a {@link NullPointerException}.
 *
 * @param rows the reader to advance
 * @throws IOException if reading from {@code rows} fails
 */
private static void skipComments(BufferedReader rows) throws IOException {
    String line;
    do {
        line = rows.readLine();
        // readLine() returns null at end of input; stop instead of dereferencing it.
    } while (line != null && line.startsWith("#"));
}

/**
 * Returns the next character from {@code nameSource} that is not a plain space (' ').
 * Any spaces encountered before it are consumed and dropped.
 *
 * @param nameSource the reader to take characters from
 * @return the first non-space character read
 * @throws IOException if the reader is exhausted before a non-space character is found
 */
private static char readNonSpace(StringReader nameSource) throws IOException {
    int next;
    do {
        next = nameSource.read();
        if (next == -1) {
            throw new IOException("Name source exhausted");
        }
    } while ((char) next == ' ');
    return (char) next;
}
}
75 changes: 75 additions & 0 deletions src/main/java/dev/morling/onebrc/WeatherStation.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
/*
* Copyright 2023 The original authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package dev.morling.onebrc;

import org.rschwietzke.FastRandom;
import org.rschwietzke.CheaperCharBuffer;

/**
 * A weather station that hands out pseudo-random temperature readings around a fixed mean.
 *
 * <p>Every station owns its own {@link FastRandom}, seeded from the global seed combined
 * with the hash code of the station id. A reading is assembled from two draws, always taken
 * in the same order: first the whole-degree part, then a single decimal digit.
 */
public class WeatherStation {
    /** Maps a digit value 0..9 to its character. */
    static final char[] NUMBERS = "0123456789".toCharArray();

    public final String id;
    public final char[] idChars;
    public final double meanTemperature;

    private final FastRandom fastRandom;

    /**
     * Creates a station.
     *
     * @param seed            global seed; XOR-ed with the hash code of {@code id} to seed
     *                        this station's random generator
     * @param id              station identifier, also exposed as a char array in {@link #idChars}
     * @param meanTemperature temperature around which readings are generated
     */
    public WeatherStation(long seed, String id, double meanTemperature) {
        this.fastRandom = new FastRandom(seed ^ (long) id.hashCode());
        this.id = id;
        this.idChars = id.toCharArray();
        this.meanTemperature = meanTemperature;
    }

    /**
     * Draws the whole-degree part of a reading: the mean temperature truncated to an int,
     * shifted by {@code nextInt(21) - 10}. Assuming {@code nextInt(bound)} yields values
     * from 0 up to {@code bound - 1}, the shift lies between -10 and +10. Done in integer
     * arithmetic to avoid double operations and rounding.
     */
    private int nextWholePart() {
        final int shift = fastRandom.nextInt(21) - 10;
        return (int) meanTemperature + shift;
    }

    /** Draws the decimal digit of a reading via {@code nextInt(10)}. */
    private int nextDecimalPart() {
        return fastRandom.nextInt(10);
    }

    /**
     * Produces the next reading as a double with one decimal place.
     *
     * <p>The decimal digit extends the whole part away from zero: it is subtracted when the
     * whole part is negative and added otherwise. A whole part of 0 therefore always yields
     * a non-negative reading.
     *
     * @return the next reading
     */
    public double measurement() {
        // Keep this order: the whole part is drawn before the decimal digit.
        final double whole = (double) nextWholePart();
        final double tenths = (double) nextDecimalPart() / 10.0;

        return whole < 0 ? whole - tenths : whole + tenths;
    }

    /**
     * Appends the next reading to {@code buffer} as text: the whole part, a '.', and one
     * decimal digit. Writing the characters directly avoids converting a double to a
     * string. Neither doubles nor a Gaussian distribution are used here, because the fake
     * numbers are sufficient for this purpose.
     *
     * @param buffer the buffer to append to
     */
    public void measurement(final CheaperCharBuffer buffer) {
        final String wholeText = String.valueOf(nextWholePart());
        // The decimal digit 0..9 as a char.
        final char tenthsDigit = NUMBERS[nextDecimalPart()];

        // Only the whole part needs a number-to-text conversion; that could still be
        // optimised further if desired.
        buffer.append(wholeText).append('.').append(tenthsDigit);
    }
}
Loading