Skip to content

Commit

Permalink
Merge branch 'main' of github.com:HTTPArchive/almanac.httparchive.org…
Browse files Browse the repository at this point in the history
… into production
  • Loading branch information
tunetheweb committed Oct 6, 2020
2 parents 917c8da + 3233bb6 commit 4f49487
Show file tree
Hide file tree
Showing 188 changed files with 4,904 additions and 122 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/generate_chapters.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ jobs:
with:
node-version: 12.x
- name: Set up Python 3.8
uses: actions/setup-python@v2.1.2
uses: actions/setup-python@v2.1.3
with:
python-version: '3.8'
- name: Run the website
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/generate_ebooks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ jobs:
with:
version: 12.x
- name: Set up Python 3.8
uses: actions/setup-python@v2.1.2
uses: actions/setup-python@v2.1.3
with:
python-version: '3.8'
- name: Install Asian Fonts
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/python-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ jobs:
- name: Checkout Code
uses: actions/[email protected]
- name: Set up Python 3.8
uses: actions/setup-python@v2.1.2
uses: actions/setup-python@v2.1.3
with:
python-version: '3.8'
- name: Install Requirements
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/test_website.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ jobs:
with:
node-version: 12.x
- name: Set up Python 3.8
uses: actions/setup-python@v2.1.2
uses: actions/setup-python@v2.1.3
with:
python-version: '3.8'
- name: Run the website
Expand Down
21 changes: 21 additions & 0 deletions sql/2020/01_CSS/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# CSS Queries

## Query size warning

The 2020 data in the [`parsed_css`](https://console.cloud.google.com/bigquery?p=httparchive&d=almanac&t=parsed_css&page=table) table is 9.7 TB, which is approximately $50 per query.

When prototyping queries, it's advisable to use the [`parsed_css_1k`](https://console.cloud.google.com/bigquery?p=httparchive&d=almanac&t=parsed_css_1k&page=table) table instead, which only contains 1000 rows for easier testing. Make sure to switch this back to the full table when saving the results for analysis.

## [CSS utils](../../lib/css-utils.js)

- **Source**: https://github.com/LeaVerou/rework-utils/tree/master/src
- **Playground**: https://projects.verou.me/rework-utils/
- **Docs**: https://projects.verou.me/rework-utils/docs/

This file provides JS utility functions to be used by the queries that depend on the `parsed_css` table.

## Related resources

- [Tracking issue](https://github.com/HTTPArchive/almanac.httparchive.org/issues/898)
- [Draft doc](https://docs.google.com/document/d/1Cy9acip1ZQScoQEeds5-6l1FFFBJTJr4SheZiQxbj-Q/edit?usp=sharing)
- [Results sheet](https://docs.google.com/spreadsheets/d/1sMWXWjMujqfAREYxNbG_t1fOJKYCA6ASLwtz4pBQVTw/edit?usp=sharing)
36 changes: 36 additions & 0 deletions sql/2020/01_CSS/box_sizing.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#standardSQL
# 1. Distribution of the number of occurrences of box-sizing:border-box per page.
# 2. Percent of pages with that style.
# Counts the border-box declarations in one parsed stylesheet.
#   css: JSON string; its `stylesheet.rules` member is handed to
#        countDeclarations() from the rework-utils library loaded below.
# Matches the `box-sizing` property with or without an -o-/-moz-/-webkit-/-ms-
# prefix, restricted to the value `border-box`.
# Returns NULL when the JSON cannot be parsed or the helper throws.
CREATE TEMPORARY FUNCTION countBorderBoxDeclarations(css STRING)
RETURNS NUMERIC
LANGUAGE js AS '''
  var total = null;
  try {
    var stylesheet = JSON.parse(css).stylesheet;
    total = countDeclarations(stylesheet.rules, {
      properties: /^(-(o|moz|webkit|ms)-)?box-sizing$/,
      values: 'border-box'
    });
  } catch (e) {
    // Any failure (bad JSON, missing stylesheet, helper error) maps to NULL.
    total = null;
  }
  return total;
'''
OPTIONS (library="gs://httparchive/lib/rework-utils.js");

# For every client and each requested percentile, reports:
#   pages                 - pages with at least one border-box declaration
#   total                 - all pages seen for that client
#   pct_pages             - pages / total
#   declarations_per_page - the given percentile of the per-page declaration count
SELECT
  percentile,
  client,
  COUNT(DISTINCT IF(declarations > 0, page, NULL)) AS pages,
  COUNT(DISTINCT page) AS total,
  COUNT(DISTINCT IF(declarations > 0, page, NULL)) / COUNT(DISTINCT page) AS pct_pages,
  # 1000 quantile buckets, so percentile P lives at offset P * 10.
  APPROX_QUANTILES(declarations, 1000 IGNORE NULLS)[OFFSET(percentile * 10)] AS declarations_per_page
FROM (
  # The UDF scores one parsed_css row at a time and a page can own several
  # rows (hence the COUNT(DISTINCT page) above). Sum to page level first so
  # the quantiles describe declarations per page rather than per row.
  # SUM skips NULLs, so rows the UDF could not parse do not hide the
  # declarations found in the page's other rows.
  SELECT
    client,
    page,
    SUM(countBorderBoxDeclarations(css)) AS declarations
  FROM
    `httparchive.almanac.parsed_css`
  WHERE
    date = '2020-08-01'
  GROUP BY
    client,
    page),
  UNNEST([10, 25, 50, 75, 90]) AS percentile
GROUP BY
  percentile,
  client
ORDER BY
  percentile,
  client
19 changes: 19 additions & 0 deletions sql/2020/01_CSS/css_in_js.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#standardSQL
# CSS in JS. WIP
# Returns the `css_in_js` entries recorded for a page.
#   payload: JSON string; its `_css` member is itself a JSON string that is
#            parsed a second time.
# Returns the `css_in_js` array when it is a non-empty array, otherwise the
# single-element array ['NONE']. If anything throws (for example invalid JSON),
# the result is a single-element array holding the error message.
# The body must be delimited by exactly three single quotes: six quotes open
# and immediately close an empty string, leaving the JavaScript outside it.
CREATE TEMPORARY FUNCTION getCssInJS(payload STRING)
RETURNS ARRAY<STRING> LANGUAGE js AS '''
try {
  var $ = JSON.parse(payload);
  var css = JSON.parse($._css);

  return Array.isArray(css.css_in_js) && css.css_in_js.length > 0 ? css.css_in_js : ['NONE'];
} catch (e) {
  return [e.message];
}
''';

# One output row per (page URL, entry returned by getCssInJS) from the
# 10k-page mobile sample table.
SELECT
  pages.url,
  cssInJs
FROM
  `httparchive.sample_data.pages_mobile_10k` AS pages
CROSS JOIN
  UNNEST(getCssInJS(pages.payload)) AS cssInJs
44 changes: 44 additions & 0 deletions sql/2020/01_CSS/custom_property_names.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#standardSQL
# Most popular custom property names as a percent of pages.
# Lists the keys of the `summary` object stored under the page payload's
# `_css-variables` member (a JSON string nested inside the payload JSON).
# Returns an empty array when either JSON document cannot be parsed or
# `summary` is not usable.
CREATE TEMPORARY FUNCTION getCustomPropertyNames(payload STRING)
RETURNS ARRAY<STRING>
LANGUAGE js AS '''
  var names = [];
  try {
    var page = JSON.parse(payload);
    var variables = JSON.parse(page['_css-variables']);
    names = Object.keys(variables.summary);
  } catch (e) {
    names = [];
  }
  return names;
''';

# Top 1000 (client, custom property name) pairs ranked by the share of that
# client's pages on which the name appears.
WITH client_totals AS (
  # Number of distinct pages per client; the denominator for pct.
  SELECT
    _TABLE_SUFFIX AS client,
    COUNT(DISTINCT url) AS total
  FROM
    `httparchive.pages.2020_08_01_*`
  GROUP BY
    _TABLE_SUFFIX
),

page_names AS (
  # Custom property names extracted from each page's payload.
  SELECT
    _TABLE_SUFFIX AS client,
    url,
    getCustomPropertyNames(payload) AS names
  FROM
    `httparchive.pages.2020_08_01_*`
)

SELECT
  client,
  name,
  COUNT(DISTINCT url) AS freq,
  total,
  COUNT(DISTINCT url) / total AS pct
FROM
  page_names
JOIN
  client_totals
USING
  (client)
CROSS JOIN
  UNNEST(page_names.names) AS name
GROUP BY
  client,
  name,
  total
ORDER BY
  pct DESC
LIMIT
  1000
44 changes: 44 additions & 0 deletions sql/2020/01_CSS/custom_property_values.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#standardSQL
# Most popular custom property values as a percent of pages.
# Lists the values of the `summary` object stored under the page payload's
# `_css-variables` member (a JSON string nested inside the payload JSON).
# Returns an empty array when either JSON document cannot be parsed or
# `summary` is not usable.
# NOTE(review): the values are coerced to STRING by the declared return type;
# confirm that the entries of `summary` are plain strings.
CREATE TEMPORARY FUNCTION getCustomPropertyValues(payload STRING)
RETURNS ARRAY<STRING>
LANGUAGE js AS '''
  var collected = [];
  try {
    var page = JSON.parse(payload);
    var variables = JSON.parse(page['_css-variables']);
    collected = Object.values(variables.summary);
  } catch (e) {
    collected = [];
  }
  return collected;
''';

# Top 1000 (client, custom property value) pairs ranked by the share of that
# client's pages on which the value appears.
WITH client_totals AS (
  # Number of distinct pages per client; the denominator for pct.
  SELECT
    _TABLE_SUFFIX AS client,
    COUNT(DISTINCT url) AS total
  FROM
    `httparchive.pages.2020_08_01_*`
  GROUP BY
    _TABLE_SUFFIX
),

page_values AS (
  # Custom property values extracted from each page's payload.
  SELECT
    _TABLE_SUFFIX AS client,
    url,
    getCustomPropertyValues(payload) AS property_values
  FROM
    `httparchive.pages.2020_08_01_*`
)

SELECT
  client,
  value,
  COUNT(DISTINCT url) AS freq,
  total,
  COUNT(DISTINCT url) / total AS pct
FROM
  page_values
JOIN
  client_totals
USING
  (client)
CROSS JOIN
  UNNEST(page_values.property_values) AS value
GROUP BY
  client,
  value,
  total
ORDER BY
  pct DESC
LIMIT
  1000
141 changes: 141 additions & 0 deletions sql/2020/01_CSS/selector_parts_freq_per_page.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
#standardSQL
# Tallies selector parts by type for one parsed stylesheet.
#   css: JSON string of the stylesheet AST.
# Every selector visited by walkSelectors() is parsed with parsel; each node
# whose type is class, id, attribute, pseudo-class or pseudo-element has its
# name counted. Each per-type tally is passed through sortObject() and then
# converted to an array of {name, value} structs, dropping entries whose
# value is NaN.
# walkSelectors, incrementByKey, sortObject and parsel come from the library
# loaded via OPTIONS below.
# On any exception the result carries a single `class` entry holding the
# error with value 0; the other fields are left unset.
CREATE TEMPORARY FUNCTION getSelectorParts(css STRING)
RETURNS STRUCT<
  class ARRAY<STRUCT<name STRING, value INT64>>,
  id ARRAY<STRUCT<name STRING, value INT64>>,
  attribute ARRAY<STRUCT<name STRING, value INT64>>,
  pseudo_class ARRAY<STRUCT<name STRING, value INT64>>,
  pseudo_element ARRAY<STRUCT<name STRING, value INT64>>
> LANGUAGE js AS '''
  // Builds {type: {name: count}} for the selector part types of interest.
  function tallySelectorParts(stylesheetAst) {
    const tallies = {
      class: {},
      id: {},
      attribute: {},
      "pseudo-class": {},
      "pseudo-element": {}
    };

    walkSelectors(stylesheetAst, function (selector) {
      const selectorAst = parsel.parse(selector, {list: false});
      parsel.walk(selectorAst, function (node) {
        if (!(node.type in tallies)) {
          return;
        }
        incrementByKey(tallies[node.type], node.name);
      }, {subtree: true});
    });

    for (let partType in tallies) {
      tallies[partType] = sortObject(tallies[partType]);
    }

    return tallies;
  }

  // Turns {name: count} into [{name, value}], skipping NaN counts.
  function toNameValuePairs(tally) {
    const pairs = [];
    for (const [name, value] of Object.entries(tally)) {
      if (isNaN(value)) {
        continue;
      }
      pairs.push({name: name, value: value});
    }
    return pairs;
  }

  try {
    const tallies = tallySelectorParts(JSON.parse(css));

    return {
      class: toNameValuePairs(tallies.class),
      id: toNameValuePairs(tallies.id),
      attribute: toNameValuePairs(tallies.attribute),
      pseudo_class: toNameValuePairs(tallies['pseudo-class']),
      pseudo_element: toNameValuePairs(tallies['pseudo-element'])
    };
  } catch (e) {
    return {class: [{name: e, value: 0}]};
  }
'''
OPTIONS (library="gs://httparchive/lib/css-utils.js");

# https://www.stevenmoseley.com/blog/tech/high-performance-sql-correlated-scalar-aggregate-reduction-queries
# Builds a sortable key: `comparator` left-padded with '0' to 11 characters,
# followed by `data`. With numeric comparators of at most 11 digits, MAX()
# over these keys selects the data belonging to the largest comparator.
# Returns NULL when either argument is NULL.
CREATE TEMPORARY FUNCTION encode(comparator STRING, data STRING)
RETURNS STRING
AS (
  LPAD(comparator, 11, '0') || data
);
# Returns `value` from its 12th character onward, i.e. with the first
# 11 characters (the padded comparator prefix of an encoded key) removed.
CREATE TEMPORARY FUNCTION decode(value STRING)
RETURNS STRING
AS (
  SUBSTR(value, 11 + 1)
);

# For each client, reports the most frequent class, id, attribute,
# pseudo-class and pseudo-element name together with its total count.
#
# Each part type is reduced to a single row per client BEFORE the joins.
# Joining the raw unnested rows on `client` alone would build a five-way
# cross product of every class, id, attribute, pseudo-class and
# pseudo-element occurrence, only to collapse it again with GROUP BY.
#
# Ties are resolved exactly as encode()/decode() dictate: among names with
# the same maximum frequency, the greatest name in string order is reported.
# A client missing any one of the five part types is omitted (inner joins).
WITH selector_parts AS (
  SELECT
    client,
    getSelectorParts(css) AS parts
  FROM
    `httparchive.almanac.parsed_css`
  WHERE
    date = '2020-08-01' AND
    # Limit the size of the CSS to avoid OOM crashes.
    LENGTH(css) < 0.1 * 1024 * 1024
),

top_class AS (
  SELECT
    client,
    decode(MAX(encode(CAST(freq AS STRING), name))) AS class_name,
    MAX(freq) AS class_freq
  FROM (
    SELECT
      client,
      class.name AS name,
      SUM(class.value) AS freq
    FROM
      selector_parts,
      UNNEST(parts.class) AS class
    GROUP BY
      client,
      class.name)
  GROUP BY
    client
),

top_id AS (
  SELECT
    client,
    decode(MAX(encode(CAST(freq AS STRING), name))) AS id_name,
    MAX(freq) AS id_freq
  FROM (
    SELECT
      client,
      id.name AS name,
      SUM(id.value) AS freq
    FROM
      selector_parts,
      UNNEST(parts.id) AS id
    GROUP BY
      client,
      id.name)
  GROUP BY
    client
),

top_attribute AS (
  SELECT
    client,
    decode(MAX(encode(CAST(freq AS STRING), name))) AS attribute_name,
    MAX(freq) AS attribute_freq
  FROM (
    SELECT
      client,
      attribute.name AS name,
      SUM(attribute.value) AS freq
    FROM
      selector_parts,
      UNNEST(parts.attribute) AS attribute
    GROUP BY
      client,
      attribute.name)
  GROUP BY
    client
),

top_pseudo_class AS (
  SELECT
    client,
    decode(MAX(encode(CAST(freq AS STRING), name))) AS pseudo_class_name,
    MAX(freq) AS pseudo_class_freq
  FROM (
    SELECT
      client,
      pseudo_class.name AS name,
      SUM(pseudo_class.value) AS freq
    FROM
      selector_parts,
      UNNEST(parts.pseudo_class) AS pseudo_class
    GROUP BY
      client,
      pseudo_class.name)
  GROUP BY
    client
),

top_pseudo_element AS (
  SELECT
    client,
    decode(MAX(encode(CAST(freq AS STRING), name))) AS pseudo_element_name,
    MAX(freq) AS pseudo_element_freq
  FROM (
    SELECT
      client,
      pseudo_element.name AS name,
      SUM(pseudo_element.value) AS freq
    FROM
      selector_parts,
      UNNEST(parts.pseudo_element) AS pseudo_element
    GROUP BY
      client,
      pseudo_element.name)
  GROUP BY
    client
)

SELECT
  client,
  class_name,
  class_freq,
  id_name,
  id_freq,
  attribute_name,
  attribute_freq,
  pseudo_class_name,
  pseudo_class_freq,
  pseudo_element_name,
  pseudo_element_freq
FROM
  top_class
JOIN
  top_id
USING
  (client)
JOIN
  top_attribute
USING
  (client)
JOIN
  top_pseudo_class
USING
  (client)
JOIN
  top_pseudo_element
USING
  (client)
Loading

0 comments on commit 4f49487

Please sign in to comment.