diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..006849d6 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +.DS_Store* +.vagrant diff --git a/CHANGELOG b/CHANGELOG new file mode 100644 index 00000000..a00e2cdf --- /dev/null +++ b/CHANGELOG @@ -0,0 +1,3 @@ +Version 0.1.0 (2016-03-01) +-------------------------- +Initial release diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000..28487835 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,16 @@ +FROM java:7 +MAINTAINER Snowplow Support + +EXPOSE 8080 +EXPOSE 5601 +EXPOSE 9200 + +ADD resources/kibana/kibana4_init /etc/init.d/kibana4_init +ADD resources/configs /home/ubuntu/snowplow/configs +ADD resources/elasticsearch /home/ubuntu/snowplow/elasticsearch +ADD scripts /home/ubuntu/snowplow/scripts + +RUN /home/ubuntu/snowplow/scripts/1_setup_docker.sh +RUN rm -rf /home/ubuntu/snowplow/staging + +CMD /home/ubuntu/snowplow/scripts/2_run_docker.sh diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000..5c304d1a --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ +Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "{}" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright {yyyy} {name of copyright owner} + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/Packerfile.json b/Packerfile.json new file mode 100644 index 00000000..cabff773 --- /dev/null +++ b/Packerfile.json @@ -0,0 +1,71 @@ +{ + "builders": [ + { + "type": "amazon-ebs", + "region": "eu-west-1", + "source_ami": "ami-65cb5a12", + "instance_type": "t2.small", + "ssh_username": "ubuntu", + "ami_name": "snowplow-mini-0.1.0-{{ timestamp }}", + "ami_groups": [ "all" ], + "ami_regions": [ "us-east-1", "us-west-2", "us-west-1", "eu-central-1", "ap-southeast-1", "ap-northeast-1", "ap-southeast-2", "sa-east-1" ], + "ami_description": "SnowplowMini - The Snowplow Pipeline in a box", + "tags": { + "OS_Version": "Ubuntu-12.04", + "Release": "0.1.0" + } + } + ], + + "provisioners": [ + { + "type": "shell", + "inline": [ + "mkdir -p /home/ubuntu/snowplow/configs", + "mkdir -p /home/ubuntu/snowplow/elasticsearch", + "mkdir -p /home/ubuntu/snowplow/scripts" + ] + }, + { + "type": "file", + "source": "resources/kibana/kibana4_init", + "destination": "/home/ubuntu/kibana4_init" + }, + { + "type": "shell", + "inline": [ + "sudo cp /home/ubuntu/kibana4_init /etc/init.d", + "sudo chmod 0755 /etc/init.d/kibana4_init", + "sudo update-rc.d kibana4_init defaults 95 10" + ] + }, + { + "type": "file", + "source": "resources/configs", + "destination": "/home/ubuntu/snowplow" + }, + { + "type": "file", + "source": "resources/elasticsearch", + "destination": "/home/ubuntu/snowplow" + }, + { + "type": "file", + "source": "scripts", + "destination": "/home/ubuntu/snowplow" + }, + { + "type": "shell", + "inline": [ + "sudo cp /home/ubuntu/snowplow/scripts/2_run_packer /etc/init.d", + "sudo chmod 0755 /etc/init.d/2_run_packer", + "sudo update-rc.d 2_run_packer defaults 95 10" + ] + }, + { + "type": "shell", + "script": "scripts/1_setup_packer.sh", + "execute_command": "chmod +x {{ .Path }}; sh '{{ .Path }}'" + } + ] +} diff --git a/README.md b/README.md new file mode 100644 index 00000000..d4bc0dd2 --- /dev/null +++ b/README.md @@ -0,0 +1,22 @@ +# Snowplow-Mini + +An easily deployable, single-instance version of Snowplow that serves three use cases: + +1. Gives a Snowplow consumer (e.g. an analyst / data team / marketing team) a way to quickly understand what Snowplow "does", i.e. what you put in at one end and what you get out at the other +2. Gives developers new to Snowplow an easy way to start with Snowplow and understand how the different pieces fit together +3. Gives people running Snowplow a quick way to debug tracker updates (because they can fire test events at it and quickly inspect how they are processed) + +## v1 + +The initial version of Snowplow-mini has only a limited subset of functionality: + +1. Data can be tracked in real time and loaded into Elasticsearch, where it can be queried (either directly or via Kibana) +2. Loading data into Redshift is not supported. (So this does not yet give analysts / data teams a good way to understand what Snowplow "does") +3. No UI is provided to indicate what is happening with each of the different subsystems (collector, enrich etc.), so this does not provide developers with a very good way of understanding how the different Snowplow subsystems work with one another +4. No validation is performed on the data, so this is not especially useful for Snowplow users who want to debug instrumentations of e.g. new trackers prior to pushing them live on Snowplow proper + +## Documentation + +1.
[Quick start guide] [get-started-guide] + +[get-started-guide]: https://github.com/snowplow/snowplow-mini/wiki/Quickstart-guide diff --git a/VERSION b/VERSION new file mode 100644 index 00000000..6c6aa7cb --- /dev/null +++ b/VERSION @@ -0,0 +1 @@ +0.1.0 \ No newline at end of file diff --git a/Vagrantfile b/Vagrantfile new file mode 100644 index 00000000..f58d9176 --- /dev/null +++ b/Vagrantfile @@ -0,0 +1,33 @@ +Vagrant.configure("2") do |config| + + config.vm.box = "ubuntu/trusty64" + config.vm.hostname = "snowplow-mini" + config.ssh.forward_agent = true + + # Use NFS for shared folders for better performance + config.vm.network :private_network, ip: '192.168.50.50' # Uncomment to use NFS + config.vm.synced_folder '.', '/vagrant', nfs: true # Uncomment to use NFS + + config.vm.network "forwarded_port", guest: 9200, host: 9200 + config.vm.network "forwarded_port", guest: 5601, host: 5601 + config.vm.network "forwarded_port", guest: 8080, host: 8080 + + config.vm.provider :virtualbox do |vb| + vb.name = Dir.pwd().split("/")[-1] + "-" + Time.now.to_f.to_i.to_s + vb.customize ["modifyvm", :id, "--natdnshostresolver1", "on"] + vb.customize [ "guestproperty", "set", :id, "--timesync-threshold", 10000 ] + # Docker is quite hungry + vb.memory = 2048 + vb.cpus = 4 + end + + config.vm.provision :shell do |sh| + sh.path = "vagrant/up.bash" + end + + # Requires Vagrant 1.7.0+ + config.push.define "publish", strategy: "local-exec" do |push| + push.script = "vagrant/push.bash" + end + +end diff --git a/examples/example-web-page-1.html b/examples/example-web-page-1.html new file mode 100644 index 00000000..a920c63d --- /dev/null +++ b/examples/example-web-page-1.html @@ -0,0 +1,152 @@ + + + + + Example events for testing Snowplow mini + + + + + + + + + + + +

Send example events into Snowplow-mini

+ +

Note: before loading this page in your browser and firing events from it into Snowplow-mini, please make sure to replace all references to 'http://ec2-54-208-64-111.compute-1.amazonaws.com:8080' with your snowplow-mini public DNS.
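+<!-- For example, if your instance's public DNS were ec2-12-34-56-78.compute-1.amazonaws.com (a made-up
+     hostname), every 'http://ec2-54-208-64-111.compute-1.amazonaws.com:8080' below would become
+     'http://ec2-12-34-56-78.compute-1.amazonaws.com:8080'. -->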

+ +

Press the buttons below to trigger individual tracking events:
+
+
+
+ +

+ Link +
+ Ignored link + + + + \ No newline at end of file diff --git a/resources/configs/default-iglu-resolver.json b/resources/configs/default-iglu-resolver.json new file mode 100644 index 00000000..8ce1f6ac --- /dev/null +++ b/resources/configs/default-iglu-resolver.json @@ -0,0 +1,18 @@ +{ + "schema": "iglu:com.snowplowanalytics.iglu/resolver-config/jsonschema/1-0-0", + "data": { + "cacheSize": 500, + "repositories": [ + { + "name": "Iglu Central", + "priority": 0, + "vendorPrefixes": ["com.snowplowanalytics"], + "connection": { + "http": { + "uri": "http://iglucentral.com" + } + } + } + ] + } +} diff --git a/resources/configs/iglu-resolver.json b/resources/configs/iglu-resolver.json new file mode 100644 index 00000000..66132bad --- /dev/null +++ b/resources/configs/iglu-resolver.json @@ -0,0 +1,32 @@ +{ + "schema": "iglu:com.snowplowanalytics.iglu/resolver-config/jsonschema/1-0-0", + "data": { + "cacheSize": 500, + "repositories": [ + { + "name": "Iglu Central", + "priority": 1, + "vendorPrefixes": [ + "com.snowplowanalytics" + ], + "connection": { + "http": { + "uri": "http://iglucentral.com" + } + } + }, + { + "name": "Iglu snowplow mini usre", + "priority": 0, + "vendorPrefixes": [ + "com.snowplow-mini-user" + ], + "connection": { + "http": { + "uri": "https://s3.amazonaws.com/bucket-name-here" + } + } + } + ] + } +} diff --git a/resources/configs/kinesis-elasticsearch-sink-good.hocon b/resources/configs/kinesis-elasticsearch-sink-good.hocon new file mode 100644 index 00000000..a67d4e8b --- /dev/null +++ b/resources/configs/kinesis-elasticsearch-sink-good.hocon @@ -0,0 +1,77 @@ +# Default configuration for kinesis-elasticsearch-sink + +sink { + + # Sources currently supported are: + # 'kinesis' for reading records from a Kinesis stream + # 'stdin' for reading unencoded tab-separated events from stdin + # If set to "stdin", JSON documents will not be sent to Elasticsearch + # but will be written to stdout. + source = "stdin" + + # Sinks currently supported are: + # 'elasticsearch-kinesis' for writing good records to Elasticsearch and bad records to Kinesis + # 'stdouterr' for writing good records to stdout and bad records to stderr + sink = "elasticsearch-stderr" + + # The following are used to authenticate for the Amazon Kinesis sink. + # + # If both are set to 'default', the default provider chain is used + # (see http://docs.aws.amazon.com/AWSJavaSDK/latest/javadoc/com/amazonaws/auth/DefaultAWSCredentialsProviderChain.html) + # + # If both are set to 'iam', use AWS IAM Roles to provision credentials. + # + # If both are set to 'env', use environment variables AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY + aws { + access-key: "" + secret-key: "" + } + + kinesis { + + in { + stream-name: "" # Kinesis stream name + + # "good" for a stream of successfully enriched events + # "bad" for a stream of bad events + stream-type: "good" + + # LATEST: most recent data. + # TRIM_HORIZON: oldest available data. + # Note: This only affects the first run of this application + # on a stream. + initial-position: "TRIM_HORIZON" + } + + out { + # Stream for enriched events which are rejected by Elasticsearch + stream-name: "" + shards: 1 + } + + region: "" + + # "app-name" is used for a DynamoDB table to maintain stream state. 
+ # You can set it automatically using: "SnowplowElasticsearchSink-$\\{connector.kinesis.in.stream-name\\}" + app-name: "" + } + + elasticsearch { + cluster-name: "elasticsearch" + endpoint: "localhost" + max-timeout: "10000" + index: "good" # Elasticsearch index name + type: "good" # Elasticsearch type name + } + + # Events are accumulated in a buffer before being sent to Elasticsearch. + # The buffer is emptied whenever: + # - the combined size of the stored records exceeds byte-limit or + # - the number of stored records exceeds record-limit or + # - the time in milliseconds since it was last emptied exceeds time-limit + buffer { + byte-limit: 5242880 + record-limit: 10000 + time-limit: 60000 + } +} diff --git a/resources/configs/scala-kinesis-enrich.hocon b/resources/configs/scala-kinesis-enrich.hocon new file mode 100644 index 00000000..616ddb42 --- /dev/null +++ b/resources/configs/scala-kinesis-enrich.hocon @@ -0,0 +1,70 @@ +# Default Configuration for Scala Kinesis Enrich. + +enrich { + # Sources currently supported are: + # 'kinesis' for reading Thrift-serialized records from a Kinesis stream + # 'stdin' for reading Base64-encoded Thrift-serialized records from stdin + source = "stdin" + + # Sinks currently supported are: + # 'kinesis' for writing enriched events to one Kinesis stream and invalid events to another. + # 'stdouterr' for writing enriched events to stdout and invalid events to stderr. + # Using "sbt assembly" and "java -jar" is recommended to disable sbt + # logging. + sink = "stdouterr" + + # AWS credentials + # + # If both are set to 'cpf', a properties file on the classpath is used. + # http://docs.aws.amazon.com/AWSJavaSDK/latest/javadoc/com/amazonaws/auth/ClasspathPropertiesFileCredentialsProvider.html + # + # If both are set to 'iam', use AWS IAM Roles to provision credentials. + # + # If both are set to 'env', use environment variables AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY + aws { + access-key: "" + secret-key: "" + } + + streams { + in: { + raw: "" + + # After enrichment, are accumulated in a buffer before being sent to Kinesis. + # The buffer is emptied whenever: + # - the number of stored records reaches record-limit or + # - the combined size of the stored records reaches byte-limit or + # - the time in milliseconds since it was last emptied exceeds time-limit when + # a new event enters the buffer + buffer: { + byte-limit: 4500000 + record-limit: 500 + time-limit: 5000 + } + } + + out: { + enriched: "" + bad: "" + + # Minimum and maximum backoff periods + # - Units: Milliseconds + backoffPolicy: { + minBackoff: 50 + maxBackoff: 500 + } + } + + # "app-name" is used for a DynamoDB table to maintain stream state. + # You can set it automatically using: "SnowplowKinesisEnrich-$\\{enrich.streams.in.raw\\}" + app-name: "" + + # LATEST: most recent data. + # TRIM_HORIZON: oldest available data. + # Note: This only effects the first run of this application + # on a stream. + initial-position = "TRIM_HORIZON" + + region: "" + } +} diff --git a/resources/configs/scala-stream-collector.hocon b/resources/configs/scala-stream-collector.hocon new file mode 100644 index 00000000..483ce88f --- /dev/null +++ b/resources/configs/scala-stream-collector.hocon @@ -0,0 +1,128 @@ +# Copyright (c) 2013-2014 Snowplow Analytics Ltd. All rights reserved. +# +# This program is licensed to you under the Apache License Version 2.0, and +# you may not use this file except in compliance with the Apache License +# Version 2.0. 
You may obtain a copy of the Apache License Version 2.0 at +# http://www.apache.org/licenses/LICENSE-2.0. +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the Apache License Version 2.0 is distributed on an "AS +# IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the Apache License Version 2.0 for the specific language +# governing permissions and limitations there under. + +# This file (application.conf.example) contains a template with +# configuration options for the Scala Stream Collector. +# +# To use, copy this to 'application.conf' and modify the configuration options. + +# 'collector' contains configuration options for the main Scala collector. +collector { + # The collector runs as a web service specified on the following + # interface and port. + interface = "0.0.0.0" + port = 8080 + + # Production mode disables additional services helpful for configuring and + # initializing the collector, such as a path '/dump' to view all + # records stored in the current stream. + production = true + + # Configure the P3P policy header. + p3p { + policyref = "/w3c/p3p.xml" + CP = "NOI DSP COR NID PSA OUR IND COM NAV STA" + } + + # The collector returns a cookie to clients for user identification + # with the following domain and expiration. + cookie { + # Set to 0 to disable the cookie + expiration = 365 days + # The domain is optional and will make the cookie accessible to other + # applications on the domain. Comment out this line to tie cookies to + # the collector's full domain + domain = "" + } + + # The collector has a configurable sink for storing data in + # different formats for the enrichment process. + sink { + # Sinks currently supported are: + # 'kinesis' for writing Thrift-serialized records to a Kinesis stream + # 'stdout' for writing Base64-encoded Thrift-serialized records to stdout + # Recommended settings for 'stdout' so each line printed to stdout + # is a serialized record are: + # 1. Setting 'akka.loglevel = OFF' and 'akka.loggers = []' + # to disable all logging. + # 2. Using 'sbt assembly' and 'java -jar ...' to disable + # sbt logging. + enabled = "stdout" + + kinesis { + thread-pool-size: 10 # Thread pool size for Kinesis API requests + + # The following are used to authenticate for the Amazon Kinesis sink. + # + # If both are set to 'cpf', a properties file on the classpath is used. + # http://docs.aws.amazon.com/AWSJavaSDK/latest/javadoc/com/amazonaws/auth/ClasspathPropertiesFileCredentialsProvider.html + # + # If both are set to 'iam', use AWS IAM Roles to provision credentials. + # + # If both are set to 'env', use environment variables AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY + aws { + access-key: "" + secret-key: "" + } + + # Data will be stored in the following stream. + stream { + region: "" + good: "" + bad: "" + } + + # Minimum and maximum backoff periods + backoffPolicy: { + minBackoff: 3000 # 3 seconds + maxBackoff: 600000 # 5 minutes + } + + # Incoming events are stored in a buffer before being sent to Kinesis. 
+ # The buffer is emptied whenever: + # - the number of stored records reaches record-limit or + # - the combined size of the stored records reaches byte-limit or + # - the time in milliseconds since the buffer was last emptied reaches time-limit + buffer { + byte-limit: 4000000 + record-limit: 500 + time-limit: 5000 + } + } + } +} + +# Akka has a variety of possible configuration options defined at +# http://doc.akka.io/docs/akka/2.2.3/general/configuration.html. +akka { + loglevel = OFF # 'OFF' for no logging, 'DEBUG' for all logging. + loggers = ["akka.event.slf4j.Slf4jLogger"] +} + +# spray-can is the server the Stream collector uses and has configurable +# options defined at +# https://github.com/spray/spray/blob/master/spray-can/src/main/resources/reference.conf +spray.can.server { + # To obtain the hostname in the collector, the 'remote-address' header + # should be set. By default, this is disabled, and enabling it + # adds the 'Remote-Address' header to every request automatically. + remote-address-header = on + + uri-parsing-mode = relaxed + raw-request-uri-header = on + + # Define the maximum request length (the default is 2048) + parsing { + max-uri-length = 32768 + } +} diff --git a/resources/elasticsearch/bad-mapping.json b/resources/elasticsearch/bad-mapping.json new file mode 100644 index 00000000..b83c8454 --- /dev/null +++ b/resources/elasticsearch/bad-mapping.json @@ -0,0 +1,41 @@ +{ + "settings": { + "analysis": { + "analyzer": { + "default": { + "type": "keyword" + } + } + }, + "index" : { + "number_of_replicas" : "0", + "number_of_shards" : "2" + } + }, + "mappings": { + "bad": { + "_timestamp" : { + "enabled" : "yes", + "path" : "failure_tstamp" + }, + "_ttl": { + "enabled": true, + "default": "604800000" + }, + "properties": { + "errors": { + "type": "string", + "analyzer": "standard" + }, + "failure_tstamp": { + "type": "date", + "format": "dateOptionalTime" + }, + "line": { + "type": "string", + "analyzer": "standard" + } + } + } + } +} diff --git a/resources/elasticsearch/good-mapping.json b/resources/elasticsearch/good-mapping.json new file mode 100644 index 00000000..1102d531 --- /dev/null +++ b/resources/elasticsearch/good-mapping.json @@ -0,0 +1,328 @@ +{ + "settings": { + "analysis": { + "analyzer": { + "default": { + "type": "keyword" + } + } + }, + "index" : { + "number_of_replicas" : "0", + "number_of_shards" : "2" + } + }, + "mappings": { + "good": { + "_timestamp" : { + "enabled" : "yes", + "path" : "collector_tstamp" + }, + "_ttl": { + "enabled": true, + "default": "604800000" + }, + "properties": { + "app_id": { + "type": "string", + "index": "not_analyzed" + }, + "br_colordepth": { + "type": "string", + "index": "not_analyzed" + }, + "br_cookies": { + "type": "boolean" + }, + "br_family": { + "type": "string", + "index": "not_analyzed" + }, + "br_features_director": { + "type": "boolean" + }, + "br_features_flash": { + "type": "boolean" + }, + "br_features_gears": { + "type": "boolean" + }, + "br_features_java": { + "type": "boolean" + }, + "br_features_pdf": { + "type": "boolean" + }, + "br_features_quicktime": { + "type": "boolean" + }, + "br_features_realplayer": { + "type": "boolean" + }, + "br_features_silverlight": { + "type": "boolean" + }, + "br_features_windowsmedia": { + "type": "boolean" + }, + "br_lang": { + "type": "string", + "index": "not_analyzed" + }, + "br_name": { + "type": "string", + "index": "not_analyzed" + }, + "br_renderengine": { + "type": "string", + "index": "not_analyzed" + }, + "br_type": { + "type": "string", + "index": 
"not_analyzed" + }, + "br_version": { + "type": "string", + "index": "not_analyzed" + }, + "br_viewheight": { + "type": "long" + }, + "br_viewwidth": { + "type": "long" + }, + "collector_tstamp": { + "type": "date", + "format": "dateOptionalTime" + }, + "doc_charset": { + "type": "string", + "index": "not_analyzed" + }, + "doc_height": { + "type": "long" + }, + "doc_width": { + "type": "long" + }, + "domain_sessionid": { + "type": "string", + "index": "not_analyzed" + }, + "domain_sessionidx": { + "type": "long" + }, + "domain_userid": { + "type": "string", + "index": "not_analyzed" + }, + "dvce_ismobile": { + "type": "boolean" + }, + "dvce_screenheight": { + "type": "long" + }, + "dvce_screenwidth": { + "type": "long" + }, + "dvce_sent_tstamp": { + "type": "date", + "format": "dateOptionalTime" + }, + "dvce_tstamp": { + "type": "date", + "format": "dateOptionalTime" + }, + "dvce_type": { + "type": "string", + "index": "not_analyzed" + }, + "etl_tstamp": { + "type": "date", + "format": "dateOptionalTime" + }, + "event": { + "type": "string", + "index": "not_analyzed" + }, + "event_id": { + "type": "string", + "index": "not_analyzed" + }, + "geo_location": { + "type": "geo_point" + }, + "mkt_campaign": { + "type": "string", + "index": "not_analyzed" + }, + "mkt_content": { + "type": "string", + "index": "not_analyzed" + }, + "mkt_medium": { + "type": "string", + "index": "not_analyzed" + }, + "mkt_source": { + "type": "string", + "index": "not_analyzed" + }, + "mkt_term": { + "type": "string", + "index": "not_analyzed" + }, + "name_tracker": { + "type": "string", + "index": "not_analyzed" + }, + "network_userid": { + "type": "string", + "index": "not_analyzed" + }, + "os_family": { + "type": "string", + "index": "not_analyzed" + }, + "os_manufacturer": { + "type": "string", + "index": "not_analyzed" + }, + "os_name": { + "type": "string", + "index": "not_analyzed" + }, + "os_timezone": { + "type": "string", + "index": "not_analyzed" + }, + "page_referrer": { + "type": "string", + "index": "not_analyzed" + }, + "page_title": { + "type": "string", + "index": "not_analyzed" + }, + "page_url": { + "type": "string", + "index": "not_analyzed" + }, + "page_urlfragment": { + "type": "string", + "index": "not_analyzed" + }, + "page_urlhost": { + "type": "string", + "index": "not_analyzed" + }, + "page_urlpath": { + "type": "string", + "index": "not_analyzed" + }, + "page_urlport": { + "type": "long" + }, + "page_urlquery": { + "type": "string", + "index": "not_analyzed" + }, + "page_urlscheme": { + "type": "string", + "index": "not_analyzed" + }, + "platform": { + "type": "string", + "index": "not_analyzed" + }, + "pp_xoffset_max": { + "type": "long" + }, + "pp_xoffset_min": { + "type": "long" + }, + "pp_yoffset_max": { + "type": "long" + }, + "pp_yoffset_min": { + "type": "long" + }, + "refr_medium": { + "type": "string", + "index": "not_analyzed" + }, + "refr_source": { + "type": "string", + "index": "not_analyzed" + }, + "refr_term": { + "type": "string", + "index": "not_analyzed" + }, + "refr_urlfragment": { + "type": "string", + "index": "not_analyzed" + }, + "refr_urlhost": { + "type": "string", + "index": "not_analyzed" + }, + "refr_urlpath": { + "type": "string", + "index": "not_analyzed" + }, + "refr_urlport": { + "type": "long" + }, + "refr_urlquery": { + "type": "string", + "index": "not_analyzed" + }, + "refr_urlscheme": { + "type": "string", + "index": "not_analyzed" + }, + "se_action": { + "type": "string", + "index": "not_analyzed" + }, + "se_category": { + "type": "string", + 
"index": "not_analyzed" + }, + "se_label": { + "type": "string", + "index": "not_analyzed" + }, + "user_fingerprint": { + "type": "string", + "index": "not_analyzed" + }, + "user_id": { + "type": "string", + "index": "not_analyzed" + }, + "user_ipaddress": { + "type": "string", + "index": "not_analyzed" + }, + "useragent": { + "type": "string", + "index": "not_analyzed" + }, + "v_collector": { + "type": "string", + "index": "not_analyzed" + }, + "v_etl": { + "type": "string", + "index": "not_analyzed" + }, + "v_tracker": { + "type": "string", + "index": "not_analyzed" + } + } + } + } +} diff --git a/resources/event-dictionary/README.md b/resources/event-dictionary/README.md new file mode 100644 index 00000000..0a226b6a --- /dev/null +++ b/resources/event-dictionary/README.md @@ -0,0 +1,162 @@ +# example event dictionary + +## Overview + +Before you can send your own event and context types into Snowplow (using the track unstructured events and custom contexts features of Snowplow), you need to: + +1. Define a JSON schema for each of the events and context types +2. Upload those schemas to your Iglu schema repository +3. Define a corresponding jsonpath file, and make sure this is uploaded your jsonpaths directory in Amazon S3 +4. Create a corresponding Redshfit table definition, and create this table in your Redshift cluster + +Once you have completed the above, you can send in data that conforms to the schemas as custom unstructured events or custom contexts. + +## Prerequisites + +We recommend setting up the following 3 tools before staring: + +1. Git so you can easily clone the repo and make updates to it +2. [Schema Guru] [schema-guru-github]. This will auto-generate your jsonpath and sql table definitions +3. The [AWS CLI] [aws-cli]. This will make it easy to push updates to Iglu at the command line. + + +## 1. Creating the schemas + +In order to start sending a new event or context type into Snowplow, you first need to define a new schema for that event. + +1. Create a file in the repo for the new schema e.g. `/schemas/com.mycompany/new_event_or_context_name/jsonschema/1-0-0` +2. Create the schema in that file. Follow the `/schemas/com.example_company/example_event/jsonschema/1-0-0` +3. Save the file schema + +Note that if you have JSON data already and you want to create a corresponding schema, you can do so using [Schema Guru][schema-guru-online], both the [web UI][schema-guru-online] and the [CLI][schema-guru-github]. + +## 2. Uploading the schemas to Iglu + +Once you've created your schemas, you need to upload them to Iglu. In practice, this means copying them into S3. + +This can be done directly via the [AWS CLI](http://aws.amazon.com/cli/). In the project root, first commit the schema to Git: + +``` +git add . +git commit -m "Committed finalized schema" +git push +``` + +Then push it to Iglu. Note that as a trial user you will have to ask the Snowplow team to do this for you. As a Managed Services +customer you would be able to do it yourself as follows: + +``` +aws s3 cp schemas s3://{{ s3 bucket for schemas }}/schemas --include "*" --recursive +``` + +Useful resources + +* [Iglu schema repository 0.1.0 release blog post](http://snowplowanalytics.com/blog/2014/07/01/iglu-schema-repository-released/) +* [Iglu central](https://github.com/snowplow/iglu-central) - centralized repository for all the schemas hosted by the Snowplow team +* [Iglu](https://github.com/snowplow/iglu) - respository with both Iglu server and client libraries + + +## 3. 
Creating the jsonpath files and SQL table definitions + +Once you've defined the jsonschema for your new event or context type you need to create a corresponding jsonpath file and SQL table definition. This can be done programmatically using [Schema Guru] [schema-guru-github]. From the root of the repo: + +``` +/path/to/schema-guru-0.4.0 ddl --with-json-paths schemas/com.mycompany/new_event_or_context_name +``` + +A corresponding jsonpath file and SQL table definition file will be generated in the appropriate folder in the repo. + +Note that you can create SQL table definition and jsonpath files for all the events / contexts you have schemas for, as follows: + +``` +/path/to/schema-guru-0.4.0 ddl --with-json-paths schemas/com.mycompany +``` + + +## 4. Uploading the jsonpath files to Iglu + +Once you've finalized the new jsonpath file, commit it to Git. From the project root: + +``` +git add . +git commit -m "Committed finalized jsonpath" +git push +``` + +Then push to Iglu. Again, you can only do this yourself as a Managed Services customer. As a trial user you will need to +ask a member of the Snowplow Analytics team to do this for you. + +``` +aws s3 cp jsonpaths s3://{{ s3 bucket for jsonpath files }}/jsonpaths --include "*" --recursive +``` + +## 5. Creating or updating the table definition in Redshift + +Once you've committed your updated table definition into GitHub, you need to either create or modify the table in Redshift, either by executing the `CREATE TABLE...` statement directly, or `ALTER TABLE` (if you're e.g. adding a column to an existing table). + +## 6. Sending data into Snowplow using the schema reference as custom unstructured events or contexts + +Once you have gone through the above process, you can start sending data that conforms to the schema(s) you've created into Snowplow as unstructured events and custom contexts. + +In both cases (custom unstructured events and contexts), the data is sent in as a JSON with two fields: a schema field with a reference to the location of the schema in Iglu, and a data field with the actual data being sent, e.g. + +```json +{ + "schema": "iglu:com.acme_company/viewed_product/jsonschema/2-0-0", + "data": { + "productId": "ASO01043", + "category": "Dresses", + "brand": "ACME", + "price": 49.95, + "sizes": [ + "xs", + "s", + "l", + "xl", + "xxl" + ], + "availableSince": "2013-03-07" + } + } +} +``` + +For more detail, please see the technical documentation for the specific tracker you're implementing. + +Note: we recommend testing that the data you're sending into Snowplow conforms to the schemas you've defined and uploaded into Iglu, before pushing updates into production. This [online JSON schema validator](http://jsonschemalint.com/draft4/) is a very useful resource for doing so. + +## 7. Managing schema migrations + +When you use Snowplow, the schema for each event and context lives with the data. That means you have the flexibility to evolve your schema definition over time. + +If you want to change your schema over time, you will need to: + +1. Create a new jsonschema file. Depending on how different this is from your current version, you will need to give it the appropriate version number. The [SchemaVer][schema-ver] specification we use when versioning data schemas can be found [here][schema-ver] +2. Update the corresponding jsonpath files. If you've created a new major schema version, you'll need to create a new jsonpath file e.g. `example_event_2.json`, that exists alongside your existing `example_event_1.json` +3.
For minor schema updates, you should be able to update your existing Redshift table definition e.g. to add additional columns. For major schema updates, you'll need to create a new Redshift table definition e.g. `com_mycompany_example_event_2.sql` +4. Start sending data into Snowplow using the new schema version (i.e. update the Iglu reference to point at the new version e.g. `2-0-0` or `1-0-1` rather than `1-0-0`). Note that you will continue to be able to send in data that conforms to the old schema at the same time. In the event that you have an event with two different major schema definitions, each event version will be loaded into a different Redshift table. + +## Additional resources + +Documentation on jsonschemas: + +* Other example jsonschemas can be found in [Iglu Central](https://github.com/snowplow/iglu-central/tree/master/schemas). Note how schemas are namespaced in different folders +* [Schema Guru] [schema-guru-online] is an [online] [schema-guru-online] and [command line tool] [schema-guru-github] for programmatically generating schemas from existing JSON data +* [Snowplow 0.9.5 release blog post](http://snowplowanalytics.com/blog/2014/07/09/snowplow-0.9.5-released-with-json-validation-shredding/), which gives an overview of the way that Snowplow uses jsonschemas to process, validate and shred unstructured event and custom context JSONs +* It can be useful to test jsonschemas using online validators e.g. [this one](http://jsonschemalint.com/draft4/) +* [json-schema.org](http://json-schema.org/) contains links to the actual jsonschema specification, examples and guide for schema authors +* The original specification for self-describing JSONs, produced by the Snowplow team, can be found [here](http://snowplowanalytics.com/blog/2014/05/15/introducing-self-describing-jsons/) + +Documentation on jsonpaths: + +* Example jsonpath files can be found on the [Snowplow repo](https://github.com/snowplow/snowplow/tree/master/4-storage/redshift-storage/jsonpaths). Note that the corresponding jsonschema definitions are stored in [Iglu Central](https://github.com/snowplow/iglu-central/tree/master/schemas) +* Amazon documentation on jsonpath files can be found [here](http://docs.aws.amazon.com/redshift/latest/dg/copy-usage_notes-copy-from-json.html) + +Documentation on creating tables in Redshift: + +* Example Redshift table definitions can be found on the [Snowplow repo](https://github.com/snowplow/snowplow/tree/master/4-storage/redshift-storage/sql). Note that corresponding jsonschema definitions are stored in [Iglu Central](https://github.com/snowplow/iglu-central/tree/master/schemas) +* Amazon documentation on Redshift create table statements can be found [here](http://docs.aws.amazon.com/redshift/latest/dg/r_CREATE_TABLE_NEW.html).
A list of Redshift data types can be found [here](http://docs.aws.amazon.com/redshift/latest/dg/c_Supported_data_types.html) + +[schema-guru-online]: http://schemaguru.snowplowanalytics.com/ +[schema-guru-github]: https://github.com/snowplow/schema-guru?_sp=44dbe9a530cc476d.1436355830779 +[aws-cli]: https://aws.amazon.com/cli/ +[schema-ver]: http://snowplowanalytics.com/blog/2014/05/13/introducing-schemaver-for-semantic-versioning-of-schemas/ diff --git a/resources/event-dictionary/jsonpaths/com.example_company/example_event_1.json b/resources/event-dictionary/jsonpaths/com.example_company/example_event_1.json new file mode 100644 index 00000000..efa95da7 --- /dev/null +++ b/resources/event-dictionary/jsonpaths/com.example_company/example_event_1.json @@ -0,0 +1,20 @@ +{ + "jsonpaths": [ + + "$.schema.vendor", + "$.schema.name", + "$.schema.format", + "$.schema.version", + + "$.hierarchy.rootId", + "$.hierarchy.rootTstamp", + "$.hierarchy.refRoot", + "$.hierarchy.refTree", + "$.hierarchy.refParent", + + "$.data.exampleStringField", + "$.data.exampleIntegerField", + "$.data.exampleNumericField", + "$.data.exampleTimestampField" + ] +} \ No newline at end of file diff --git a/resources/event-dictionary/schemas/com.example_company/example_event/jsonschema/1-0-0 b/resources/event-dictionary/schemas/com.example_company/example_event/jsonschema/1-0-0 new file mode 100644 index 00000000..53f04516 --- /dev/null +++ b/resources/event-dictionary/schemas/com.example_company/example_event/jsonschema/1-0-0 @@ -0,0 +1,32 @@ +{ + "$schema": "http://iglucentral.com/schemas/com.snowplowanalytics.self-desc/schema/jsonschema/1-0-0#", + "description": "Schema for an example event", + "self": { + "vendor": "com.example_company", + "name": "example_event", + "format": "jsonschema", + "version": "1-0-0" + }, + + "type": "object", + "properties": { + "exampleStringField": { + "type": "string", + "maxLength": 255 + }, + "exampleIntegerField": { + "type": "integer" + }, + "exampleNumericField": { + "type": "number", + "maxDecimal": 3 + }, + "exampleTimestampField": { + "type": "string", + "format": "date-time" + } + }, + "minProperties":1, + "required": ["exampleStringField", "exampleIntegerField"], + "additionalProperties": false +} diff --git a/resources/event-dictionary/sql/com.example_company/example_event_1.sql b/resources/event-dictionary/sql/com.example_company/example_event_1.sql new file mode 100644 index 00000000..37649252 --- /dev/null +++ b/resources/event-dictionary/sql/com.example_company/example_event_1.sql @@ -0,0 +1,24 @@ +-- Compatibility: iglu:com.example_company/example_event/jsonschema/1-0-0 + +CREATE TABLE atomic.com_example_company_example_event_1 ( + -- Schema of this type + schema_vendor varchar(128) encode runlength not null, + schema_name varchar(128) encode runlength not null, + schema_format varchar(128) encode runlength not null, + schema_version varchar(128) encode runlength not null, + -- Parentage of this type + root_id char(36) encode raw not null, + root_tstamp timestamp encode raw not null, + ref_root varchar(255) encode runlength not null, + ref_tree varchar(1500) encode runlength not null, + ref_parent varchar(255) encode runlength not null, + -- Properties of this type + example_string_field varchar(255) not null, + example_integer_field integer not null, + example_numeric_field decimal(8,2), + example_timestamp_field timestamp +) +DISTSTYLE KEY +-- Optimized join to atomic.events +DISTKEY (root_id) +SORTKEY (root_tstamp); \ No newline at end of file diff --git 
a/resources/kibana/kibana4_init b/resources/kibana/kibana4_init new file mode 100755 index 00000000..da5b12da --- /dev/null +++ b/resources/kibana/kibana4_init @@ -0,0 +1,87 @@ +#!/bin/sh +# +# /etc/init.d/kibana4_init -- startup script for kibana4 +# bsmith@the408.com 2015-02-20; used elasticsearch init script as template +# https://github.com/akabdog/scripts/edit/master/kibana4_init +# +### BEGIN INIT INFO +# Provides: kibana4_init +# Required-Start: $network $remote_fs $named +# Required-Stop: $network $remote_fs $named +# Default-Start: 2 3 4 5 +# Default-Stop: 0 1 6 +# Short-Description: Starts kibana4_init +# Description: Starts kibana4_init using start-stop-daemon +### END INIT INFO + +#configure this with wherever you unpacked kibana: +KIBANA_BIN=/opt/kibana/bin + +NAME=kibana4 +PID_FILE=/var/run/$NAME.pid +PATH=/bin:/usr/bin:/sbin:/usr/sbin:$KIBANA_BIN +DAEMON=$KIBANA_BIN/kibana +DESC="Kibana4" + +if [ `id -u` -ne 0 ]; then + echo "You need root privileges to run this script" + exit 1 +fi + +. /lib/lsb/init-functions + +if [ -r /etc/default/rcS ]; then + . /etc/default/rcS +fi + +case "$1" in + start) + log_daemon_msg "Starting $DESC" + + pid=`pidofproc -p $PID_FILE kibana` + if [ -n "$pid" ] ; then + log_begin_msg "Already running." + log_end_msg 0 + exit 0 + fi + + # Start Daemon + start-stop-daemon --start --pidfile "$PID_FILE" --make-pidfile --background --exec $DAEMON + log_end_msg $? + ;; + stop) + log_daemon_msg "Stopping $DESC" + + if [ -f "$PID_FILE" ]; then + start-stop-daemon --stop --pidfile "$PID_FILE" \ + --retry=TERM/20/KILL/5 >/dev/null + if [ $? -eq 1 ]; then + log_progress_msg "$DESC is not running but pid file exists, cleaning up" + elif [ $? -eq 3 ]; then + PID="`cat $PID_FILE`" + log_failure_msg "Failed to stop $DESC (pid $PID)" + exit 1 + fi + rm -f "$PID_FILE" + else + log_progress_msg "(not running)" + fi + log_end_msg 0 + ;; + status) + status_of_proc -p $PID_FILE kibana kibana && exit 0 || exit $? 
+ ;; + restart|force-reload) + if [ -f "$PID_FILE" ]; then + $0 stop + sleep 1 + fi + $0 start + ;; + *) + log_success_msg "Usage: $0 {start|stop|restart|force-reload|status}" + exit 1 + ;; +esac + +exit 0 diff --git a/scripts/1_setup_docker.sh b/scripts/1_setup_docker.sh new file mode 100755 index 00000000..5025b05f --- /dev/null +++ b/scripts/1_setup_docker.sh @@ -0,0 +1,61 @@ +#!/bin/bash -e + +apt-get update +apt-get install -y unzip + +############# +# Constants # +############# + +main_dir=/home/ubuntu/snowplow + +configs_dir=$main_dir/configs +staging_dir=$main_dir/staging +executables_dir=$main_dir/bin +unix_pipes_dir=$main_dir/pipes +es_dir=$main_dir/elasticsearch +scripts_dir=$main_dir/scripts + +raw_events_pipe=$unix_pipes_dir/raw-events-pipe +enriched_pipe=$unix_pipes_dir/enriched-events-pipe + +kinesis_package=snowplow_kinesis_r67_bohemian_waxwing.zip +kibana_v=4.0.1 + +########################### +# Setup Directories/Files # +########################### + +mkdir -p $configs_dir +mkdir -p $staging_dir +mkdir -p $executables_dir +mkdir -p $unix_pipes_dir +mkdir -p $es_dir +mkdir -p $scripts_dir + +mkfifo $raw_events_pipe +mkfifo $enriched_pipe + +################################ +# Install Kinesis Applications # +################################ + +wget http://dl.bintray.com/snowplow/snowplow-generic/${kinesis_package} -P $staging_dir +unzip $staging_dir/${kinesis_package} -d $executables_dir + +######################### +# Install Elasticsearch # +######################### + +wget -qO - https://packages.elastic.co/GPG-KEY-elasticsearch | apt-key add - +echo "deb http://packages.elastic.co/elasticsearch/1.4/debian stable main" | tee -a /etc/apt/sources.list +apt-get update -y && apt-get install elasticsearch -y +/usr/share/elasticsearch/bin/plugin --install mobz/elasticsearch-head + +################## +# Install Kibana # +################## + +wget "https://download.elasticsearch.org/kibana/kibana/kibana-${kibana_v}-linux-x64.zip" -P $staging_dir +unzip $staging_dir/kibana-${kibana_v}-linux-x64.zip -d /opt/ +ln -s /opt/kibana-${kibana_v}-linux-x64 /opt/kibana diff --git a/scripts/1_setup_packer.sh b/scripts/1_setup_packer.sh new file mode 100755 index 00000000..60fb844d --- /dev/null +++ b/scripts/1_setup_packer.sh @@ -0,0 +1,83 @@ +#!/bin/bash -e + +sudo apt-get update +sudo apt-get install -y unzip + +############# +# Constants # +############# + +main_dir=/home/ubuntu/snowplow + +configs_dir=$main_dir/configs +staging_dir=$main_dir/staging +executables_dir=$main_dir/bin +unix_pipes_dir=$main_dir/pipes +es_dir=$main_dir/elasticsearch +scripts_dir=$main_dir/scripts + +raw_events_pipe=$unix_pipes_dir/raw-events-pipe +enriched_pipe=$unix_pipes_dir/enriched-events-pipe + +kinesis_package=snowplow_kinesis_r67_bohemian_waxwing.zip +kibana_v=4.0.1 + +########################### +# Setup Directories/Files # +########################### + +mkdir -p $configs_dir +mkdir -p $staging_dir +mkdir -p $executables_dir +mkdir -p $unix_pipes_dir +mkdir -p $es_dir +mkdir -p $scripts_dir + +mkfifo $raw_events_pipe +mkfifo $enriched_pipe + +################## +# Install Java 7 # +################## + +sudo add-apt-repository ppa:webupd8team/java -y +sudo apt-get update +echo oracle-java7-installer shared/accepted-oracle-license-v1-1 select true | sudo /usr/bin/debconf-set-selections +sudo apt-get install oracle-java7-installer -y + +################################ +# Install Kinesis Applications # +################################ + +wget 
http://dl.bintray.com/snowplow/snowplow-generic/${kinesis_package} -P $staging_dir +unzip $staging_dir/${kinesis_package} -d $executables_dir + +######################### +# Install Elasticsearch # +######################### + +wget -qO - https://packages.elastic.co/GPG-KEY-elasticsearch | sudo apt-key add - +echo "deb http://packages.elastic.co/elasticsearch/1.4/debian stable main" | sudo tee -a /etc/apt/sources.list +sudo apt-get update -y && sudo apt-get install elasticsearch -y +sudo update-rc.d elasticsearch defaults 95 10 +sudo /usr/share/elasticsearch/bin/plugin --install mobz/elasticsearch-head + +################## +# Install Kibana # +################## + +wget "https://download.elasticsearch.org/kibana/kibana/kibana-${kibana_v}-linux-x64.zip" -P $staging_dir +sudo unzip $staging_dir/kibana-${kibana_v}-linux-x64.zip -d /opt/ +sudo ln -s /opt/kibana-${kibana_v}-linux-x64 /opt/kibana + +sudo chown -R ubuntu:ubuntu $main_dir + +################ +# Add Mappings # +################ + +sudo service elasticsearch start +sleep 15 + +curl -XPUT 'http://localhost:9200/good' -d @${es_dir}/good-mapping.json +curl -XPUT 'http://localhost:9200/bad' -d @${es_dir}/bad-mapping.json diff --git a/scripts/2_run_docker.sh b/scripts/2_run_docker.sh new file mode 100755 index 00000000..bdf02eed --- /dev/null +++ b/scripts/2_run_docker.sh @@ -0,0 +1,40 @@ +#!/bin/bash -e + +############# +# Constants # +############# + +main_dir=/home/ubuntu/snowplow + +configs_dir=$main_dir/configs +executables_dir=$main_dir/bin +unix_pipes_dir=$main_dir/pipes +es_dir=$main_dir/elasticsearch +scripts_dir=$main_dir/scripts + +raw_events_pipe=$unix_pipes_dir/raw-events-pipe +enriched_pipe=$unix_pipes_dir/enriched-events-pipe + +##################### +# Start ES + Kibana # +##################### + +service elasticsearch start +service kibana4_init start + +sleep 15 + +################ +# Add Mappings # +################ + +curl -XPUT 'http://localhost:9200/good' -d @${es_dir}/good-mapping.json +curl -XPUT 'http://localhost:9200/bad' -d @${es_dir}/bad-mapping.json + +################################################# +# Start Collector/Enrichment/Elasticsearch Sink # +################################################# + +${executables_dir}/snowplow-stream-collector-0.5.0 --config ${configs_dir}/scala-stream-collector.hocon > $raw_events_pipe & +cat $raw_events_pipe | ${executables_dir}/snowplow-kinesis-enrich-0.6.0 --config ${configs_dir}/scala-kinesis-enrich.hocon --resolver file:${configs_dir}/default-iglu-resolver.json > $enriched_pipe & +cat $enriched_pipe | ${executables_dir}/snowplow-elasticsearch-sink-0.4.0 --config ${configs_dir}/kinesis-elasticsearch-sink-good.hocon | ${scripts_dir}/elasticsearch_upload.pl diff --git a/scripts/2_run_packer b/scripts/2_run_packer new file mode 100755 index 00000000..be7d5da1 --- /dev/null +++ b/scripts/2_run_packer @@ -0,0 +1,30 @@ +#!/bin/bash -e + +############# +# Constants # +############# + +main_dir=/home/ubuntu/snowplow + +configs_dir=$main_dir/configs +executables_dir=$main_dir/bin +unix_pipes_dir=$main_dir/pipes +scripts_dir=$main_dir/scripts + +raw_events_pipe=$unix_pipes_dir/raw-events-pipe +enriched_pipe=$unix_pipes_dir/enriched-events-pipe + +##################### +# Start ES + Kibana # +##################### + +sudo service elasticsearch start +sudo service kibana4_init start + +################################################# +# Start Collector/Enrichment/Elasticsearch Sink # +################################################# + 
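+# How the pipeline below fits together (per the configs in /home/ubuntu/snowplow/configs):
+#   - the Scala Stream Collector writes Base64-encoded Thrift records to stdout, redirected into $raw_events_pipe
+#   - Scala Kinesis Enrich reads that pipe on stdin and writes enriched events to $enriched_pipe
+#   - the Elasticsearch Sink turns each enriched event into an Elasticsearch JSON document on stdout,
+#     and elasticsearch_upload.pl POSTs each document to the local 'good' index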
+${executables_dir}/snowplow-stream-collector-0.5.0 --config ${configs_dir}/scala-stream-collector.hocon > $raw_events_pipe & +cat $raw_events_pipe | ${executables_dir}/snowplow-kinesis-enrich-0.6.0 --config ${configs_dir}/scala-kinesis-enrich.hocon --resolver file:${configs_dir}/default-iglu-resolver.json > $enriched_pipe & +cat $enriched_pipe | ${executables_dir}/snowplow-elasticsearch-sink-0.4.0 --config ${configs_dir}/kinesis-elasticsearch-sink-good.hocon | ${scripts_dir}/elasticsearch_upload.pl & diff --git a/scripts/elasticsearch_upload.pl b/scripts/elasticsearch_upload.pl new file mode 100755 index 00000000..48a30cb0 --- /dev/null +++ b/scripts/elasticsearch_upload.pl @@ -0,0 +1,15 @@ +#!/usr/bin/perl -w + +use strict; + +sub send_to_elastic { + my $to_send = $_[0]; + `curl -XPOST http://localhost:9200/good/good -d '$to_send'` +} + +while (<STDIN>) +{ + chomp; + my $ln = $_; + send_to_elastic($ln); +} diff --git a/ui/setup-page.graffle b/ui/setup-page.graffle new file mode 100644 index 00000000..a8e751e6 Binary files /dev/null and b/ui/setup-page.graffle differ diff --git a/ui/setup-page.png b/ui/setup-page.png new file mode 100644 index 00000000..08ab149f Binary files /dev/null and b/ui/setup-page.png differ diff --git a/vagrant/.gitignore b/vagrant/.gitignore new file mode 100644 index 00000000..1b4b29ff --- /dev/null +++ b/vagrant/.gitignore @@ -0,0 +1,3 @@ +.peru +oss-playbooks +ansible diff --git a/vagrant/ansible.hosts b/vagrant/ansible.hosts new file mode 100644 index 00000000..588fa08c --- /dev/null +++ b/vagrant/ansible.hosts @@ -0,0 +1,2 @@ +[vagrant] +127.0.0.1:2222 diff --git a/vagrant/peru.yaml b/vagrant/peru.yaml new file mode 100644 index 00000000..e7fdf41c --- /dev/null +++ b/vagrant/peru.yaml @@ -0,0 +1,14 @@ +imports: + ansible: ansible + ansible_playbooks: oss-playbooks + +curl module ansible: + # Equivalent of git cloning tags/v1.6.6 but much, much faster + url: https://codeload.github.com/ansible/ansible/zip/69d85c22c7475ccf8169b6ec9dee3ee28c92a314 + unpack: zip + export: ansible-69d85c22c7475ccf8169b6ec9dee3ee28c92a314 + +git module ansible_playbooks: + url: https://github.com/snowplow/ansible-playbooks.git + # Uncomment to fetch a specific rev instead of master: + # rev: xxx diff --git a/vagrant/push.bash b/vagrant/push.bash new file mode 100755 index 00000000..4c7a1384 --- /dev/null +++ b/vagrant/push.bash @@ -0,0 +1,97 @@ +#!/bin/bash +set -e + +bintray_user=snowplowbot +bintray_repository=snowplow-docker-snowplow-docker.bintray.io +bintray_email=systems@snowplowanalytics.com +img_name=generic/snowplow-mini +tar_name=generic-snowplow-mini + +# Similar to Perl die +function die() { + echo "$@" 1>&2 ; exit 1; +} + +# Check if our Vagrant box is running. Expects `vagrant status` to look like: +# +# > Current machine states: +# > +# > default poweroff (virtualbox) +# > +# > The VM is powered off. To restart the VM, simply run `vagrant up` +# +# Parameters: +# 1. out_running (out parameter) +function is_running { + [ "$#" -eq 1 ] || die "1 argument required, $# provided" + local __out_running=$1 + + set +e + vagrant status | sed -n 3p | grep -q "^default\s*running (virtualbox)$" + local retval=${?} + set -e + if [ ${retval} -eq "0" ] ; then + eval ${__out_running}=1 + else + eval ${__out_running}=0 + fi +} + +# Get version, checking we are on the latest +# +# Parameters: +# 1. out_version (out parameter) +# 2.
out_error (out parameter) +function get_version { + [ "$#" -eq 2 ] || die "2 arguments required, $# provided" + local __out_version=$1 + local __out_error=$2 + + file_version=`cat VERSION` + expected_tag="$file_version" + tag_version=`git describe --abbrev=0 --tags` + if [ ${expected_tag} != ${tag_version} ] ; then + eval ${__out_error}="'File version ${expected_tag} != tag version ${tag_version}'" + else + eval ${__out_version}=${expected_tag} + fi +} + +# Go to parent-parent dir of this script +function cd_root() { + source="${BASH_SOURCE[0]}" + while [ -h "${source}" ] ; do source="$(readlink "${source}")"; done + dir="$( cd -P "$( dirname "${source}" )/.." && pwd )" + cd ${dir} +} + +cd_root + +# Precondition for running +running=0 && is_running "running" +[ ${running} -eq 1 ] || die "Vagrant guest must be running to push" + +# Git tag must match version in package.json +version=`cat VERSION` +#version="" && error="" && get_version "version" "error" +#[ "${error}" ] && die "Versions don't match: ${error}. Are you trying to publish an old version, or maybe on the wrong branch?" + +# Can't pass args thru vagrant push so have to prompt +read -e -p "Please enter API key for Bintray user ${bintray_user}: " bintray_api_key + +# Build Docker Image +cmd="cd /vagrant && sudo docker build -t ${img_name}:${version} ." +vagrant ssh -c "${cmd}" + +# Get Image ID +cmd="sudo docker images | grep \"^${img_name}\" -m 1 | awk '{print \$3}'" +img_id=`vagrant ssh -c "${cmd}"` +[ "${img_id}" != "" ] || die "Image ID not found cannot push to Bintray." + +# Upload to Bintray +cmd="sudo docker login -u ${bintray_user} -p ${bintray_api_key} -e ${bintray_email} ${bintray_repository} && \ + sudo docker tag ${img_id:0:12} ${bintray_repository}/${img_name}:${version} && \ + sudo docker push ${bintray_repository}/${img_name}:${version}" +vagrant ssh -c "${cmd}" + +exit 0 diff --git a/vagrant/up.bash b/vagrant/up.bash new file mode 100755 index 00000000..7450ae89 --- /dev/null +++ b/vagrant/up.bash @@ -0,0 +1,50 @@ +#!/bin/bash +set -e + +vagrant_dir=/vagrant/vagrant +bashrc=/home/vagrant/.bashrc + +echo "========================================" +echo "INSTALLING PERU AND ANSIBLE DEPENDENCIES" +echo "----------------------------------------" +apt-get update +apt-get install -y language-pack-en git unzip libyaml-dev python3-pip python-yaml python-paramiko python-jinja2 + +echo "===============" +echo "INSTALLING PERU" +echo "---------------" +sudo pip3 install peru + +echo "=======================================" +echo "CLONING ANSIBLE AND PLAYBOOKS WITH PERU" +echo "---------------------------------------" +cd ${vagrant_dir} && peru sync -v +echo "... done" + +env_setup=${vagrant_dir}/ansible/hacking/env-setup +hosts=${vagrant_dir}/ansible.hosts + +echo "===================" +echo "CONFIGURING ANSIBLE" +echo "-------------------" +touch ${bashrc} +echo "source ${env_setup}" >> ${bashrc} +echo "export ANSIBLE_HOSTS=${hosts}" >> ${bashrc} +echo "... 
done" + +echo "==========================================" +echo "RUNNING PLAYBOOKS WITH ANSIBLE*" +echo "* no output while each playbook is running" +echo "------------------------------------------" +while read pb; do + su - -c "source ${env_setup} && ${vagrant_dir}/ansible/bin/ansible-playbook ${vagrant_dir}/${pb} --connection=local --inventory-file=${hosts}" vagrant +done <${vagrant_dir}/up.playbooks + +guidance=${vagrant_dir}/up.guidance + +if [ -f ${guidance} ]; then + echo "===========" + echo "PLEASE READ" + echo "-----------" + cat $guidance +fi diff --git a/vagrant/up.guidance b/vagrant/up.guidance new file mode 100644 index 00000000..0575dbc6 --- /dev/null +++ b/vagrant/up.guidance @@ -0,0 +1,3 @@ +To get started: +vagrant ssh +cd /vagrant diff --git a/vagrant/up.playbooks b/vagrant/up.playbooks new file mode 100644 index 00000000..c59a8819 --- /dev/null +++ b/vagrant/up.playbooks @@ -0,0 +1,2 @@ +oss-playbooks/packer.yml +oss-playbooks/docker.yml
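A minimal usage sketch once the box is provisioned, assuming the playbooks above install Packer and Docker inside the guest; the image name and version follow `vagrant/push.bash` and `VERSION`, and the AWS credentials are placeholders you must supply for the `amazon-ebs` builder in `Packerfile.json`:

```bash
# On the host: boot and enter the VM
vagrant up
vagrant ssh

# Inside the guest: the repo is mounted at /vagrant
cd /vagrant

# Build the Docker image from the Dockerfile (the same command vagrant/push.bash runs)
sudo docker build -t generic/snowplow-mini:0.1.0 .

# Build the AMI described in Packerfile.json
export AWS_ACCESS_KEY_ID=...        # placeholder: your AWS access key
export AWS_SECRET_ACCESS_KEY=...    # placeholder: your AWS secret key
packer build Packerfile.json
```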