-
-
Notifications
You must be signed in to change notification settings - Fork 176
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'main' of github.com:HTTPArchive/almanac.httparchive.org…
… into production
- Loading branch information
Showing
188 changed files
with
4,904 additions
and
122 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -45,7 +45,7 @@ jobs: | |
with: | ||
node-version: 12.x | ||
- name: Set up Python 3.8 | ||
uses: actions/setup-python@v2.1.2 | ||
uses: actions/setup-python@v2.1.3 | ||
with: | ||
python-version: '3.8' | ||
- name: Run the website | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -27,7 +27,7 @@ jobs: | |
with: | ||
version: 12.x | ||
- name: Set up Python 3.8 | ||
uses: actions/setup-python@v2.1.2 | ||
uses: actions/setup-python@v2.1.3 | ||
with: | ||
python-version: '3.8' | ||
- name: Install Asian Fonts | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -11,7 +11,7 @@ jobs: | |
- name: Checkout Code | ||
uses: actions/[email protected] | ||
- name: Set up Python 3.8 | ||
uses: actions/setup-python@v2.1.2 | ||
uses: actions/setup-python@v2.1.3 | ||
with: | ||
python-version: '3.8' | ||
- name: Install Requirements | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -22,7 +22,7 @@ jobs: | |
with: | ||
node-version: 12.x | ||
- name: Set up Python 3.8 | ||
uses: actions/setup-python@v2.1.2 | ||
uses: actions/setup-python@v2.1.3 | ||
with: | ||
python-version: '3.8' | ||
- name: Run the website | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
# CSS Queries | ||
|
||
## Query size warning | ||
|
||
The 2020 data in the [`parsed_css`](https://console.cloud.google.com/bigquery?p=httparchive&d=almanac&t=parsed_css&page=table) table is 9.7 TB, which is approximately $50 per query. | ||
|
||
When prototyping queries, it's advisable to use the [`parsed_css_1k`](https://console.cloud.google.com/bigquery?p=httparchive&d=almanac&t=parsed_css_1k&page=table) table instead, which only contains 1000 rows for easier testing. Make sure to switch this back to the full table when saving the results for analysis. | ||
|
||
## [CSS utils](../../lib/css-utils.js) | ||
|
||
- **Source**: https://github.com/LeaVerou/rework-utils/tree/master/src | ||
- **Playground**: https://projects.verou.me/rework-utils/ | ||
- **Docs**: https://projects.verou.me/rework-utils/docs/ | ||
|
||
This file provides JS utility functions to be used by the queries that depend on the `parsed_css` table. | ||
|
||
## Related resources | ||
|
||
- [Tracking issue](https://github.com/HTTPArchive/almanac.httparchive.org/issues/898) | ||
- [Draft doc](https://docs.google.com/document/d/1Cy9acip1ZQScoQEeds5-6l1FFFBJTJr4SheZiQxbj-Q/edit?usp=sharing) | ||
- [Results sheet](https://docs.google.com/spreadsheets/d/1sMWXWjMujqfAREYxNbG_t1fOJKYCA6ASLwtz4pBQVTw/edit?usp=sharing) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
#standardSQL | ||
# 1. Distribution of the number of occurrences of box-sizing:border-box per page. | ||
# 2. Percent of pages with that style. | ||
# Counts the `box-sizing: border-box` declarations (vendor-prefixed or not) in
# one parsed stylesheet.
#   css: JSON-serialized stylesheet AST; its `stylesheet.rules` list is scanned.
# Returns the count reported by `countDeclarations`, or NULL when the JSON
# cannot be parsed or the helper throws.
# `countDeclarations` is not defined here; presumably it is provided by the
# library loaded through OPTIONS.
CREATE TEMPORARY FUNCTION countBorderBoxDeclarations(css STRING)
RETURNS NUMERIC
LANGUAGE js AS '''
  // Property matcher: box-sizing with an optional -o-/-moz-/-webkit-/-ms- prefix.
  const matcher = {properties: /^(-(o|moz|webkit|ms)-)?box-sizing$/, values: 'border-box'};
  let total = null;
  try {
    const parsed = JSON.parse(css);
    total = countDeclarations(parsed.stylesheet.rules, matcher);
  } catch (e) {
    // Any parse or traversal failure is reported as NULL.
    total = null;
  }
  return total;
'''
OPTIONS (library="gs://httparchive/lib/rework-utils.js");
|
||
# For each client and percentile (10/25/50/75/90), reports:
#   pages                 - pages with at least one border-box declaration
#   total                 - all pages seen for the client
#   pct_pages             - pages / total
#   declarations_per_page - the percentile of the per-page declaration count
#
# The inner query sums the per-row counts for each (client, page) so that the
# quantile is really computed per page. `parsed_css` appears to hold several
# rows per page (hence the COUNT(DISTINCT page) below); without the SUM the
# distribution would be per row rather than per page.
SELECT
  percentile,
  client,
  COUNT(DISTINCT IF(declarations > 0, page, NULL)) AS pages,
  COUNT(DISTINCT page) AS total,
  COUNT(DISTINCT IF(declarations > 0, page, NULL)) / COUNT(DISTINCT page) AS pct_pages,
  # 1000 buckets, so percentile p lives at offset p * 10.
  APPROX_QUANTILES(declarations, 1000 IGNORE NULLS)[OFFSET(percentile * 10)] AS declarations_per_page
FROM (
  SELECT
    client,
    page,
    # SUM skips the NULLs returned for rows whose CSS could not be parsed.
    SUM(countBorderBoxDeclarations(css)) AS declarations
  FROM
    `httparchive.almanac.parsed_css`
  WHERE
    date = '2020-08-01'
  GROUP BY
    client,
    page),
  UNNEST([10, 25, 50, 75, 90]) AS percentile
GROUP BY
  percentile,
  client
ORDER BY
  percentile,
  client
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
#standardSQL | ||
# CSS in JS. WIP | ||
# Extracts the CSS-in-JS entries recorded for a page.
#   payload: JSON string whose `_css` property is itself a JSON string that
#            holds a `css_in_js` array.
# Returns the `css_in_js` array when it is a non-empty array, ['NONE'] when it
# is missing or empty, and a single-element array with the error message when
# parsing fails (so failures stay visible while this query is a WIP).
CREATE TEMPORARY FUNCTION getCssInJS(payload STRING)
RETURNS ARRAY<STRING> LANGUAGE js AS '''
  try {
    var $ = JSON.parse(payload);
    var css = JSON.parse($._css);

    return Array.isArray(css.css_in_js) && css.css_in_js.length > 0 ? css.css_in_js : ['NONE'];
  } catch (e) {
    return [e.message];
  }
''';
|
||
# One output row per (page url, entry returned by getCssInJS) from the 10k
# mobile sample table.
SELECT
  url,
  cssInJs
FROM
  `httparchive.sample_data.pages_mobile_10k`,
  UNNEST(getCssInJS(payload)) AS cssInJs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
#standardSQL | ||
# Most popular custom property names as a percent of pages. | ||
# Lists the custom property names recorded for a page.
#   payload: JSON string whose `_css-variables` property is itself a JSON
#            string containing a `summary` object.
# Returns the keys of `summary`, or an empty array if anything fails to parse.
CREATE TEMPORARY FUNCTION getCustomPropertyNames(payload STRING)
RETURNS ARRAY<STRING>
LANGUAGE js AS '''
  var names = [];
  try {
    var page = JSON.parse(payload);
    var customProperties = JSON.parse(page['_css-variables']);
    names = Object.keys(customProperties.summary);
  } catch (e) {
    // Missing or malformed data: report no names.
    names = [];
  }
  return names;
''';
|
||
# Top 1000 (client, custom property name) pairs ranked by the share of the
# client's pages that use the name.
WITH totals AS (
  # Number of distinct pages per client; the denominator for pct.
  SELECT
    _TABLE_SUFFIX AS client,
    COUNT(DISTINCT url) AS total
  FROM
    `httparchive.pages.2020_08_01_*`
  GROUP BY
    client
),

names_by_page AS (
  # Custom property names extracted from each page payload.
  SELECT
    _TABLE_SUFFIX AS client,
    url,
    getCustomPropertyNames(payload) AS names
  FROM
    `httparchive.pages.2020_08_01_*`
)

SELECT
  client,
  name,
  COUNT(DISTINCT url) AS freq,
  total,
  COUNT(DISTINCT url) / total AS pct
FROM
  names_by_page
JOIN
  totals
USING
  (client),
  UNNEST(names) AS name
GROUP BY
  client,
  name,
  total
ORDER BY
  pct DESC
LIMIT
  1000
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
#standardSQL | ||
# Most popular custom property values as a percent of pages. | ||
# Lists the custom property values recorded for a page.
#   payload: JSON string whose `_css-variables` property is itself a JSON
#            string containing a `summary` object.
# Returns the values of `summary`, or an empty array if anything fails to parse.
CREATE TEMPORARY FUNCTION getCustomPropertyValues(payload STRING)
RETURNS ARRAY<STRING>
LANGUAGE js AS '''
  var propertyValues = [];
  try {
    var page = JSON.parse(payload);
    var customProperties = JSON.parse(page['_css-variables']);
    propertyValues = Object.values(customProperties.summary);
  } catch (e) {
    // Missing or malformed data: report no values.
    propertyValues = [];
  }
  return propertyValues;
''';
|
||
# Top 1000 (client, custom property value) pairs ranked by the share of the
# client's pages that use the value.
WITH totals AS (
  # Number of distinct pages per client; the denominator for pct.
  SELECT
    _TABLE_SUFFIX AS client,
    COUNT(DISTINCT url) AS total
  FROM
    `httparchive.pages.2020_08_01_*`
  GROUP BY
    client
),

values_by_page AS (
  # Custom property values extracted from each page payload.
  SELECT
    _TABLE_SUFFIX AS client,
    url,
    getCustomPropertyValues(payload) AS property_values
  FROM
    `httparchive.pages.2020_08_01_*`
)

SELECT
  client,
  value,
  COUNT(DISTINCT url) AS freq,
  total,
  COUNT(DISTINCT url) / total AS pct
FROM
  values_by_page
JOIN
  totals
USING
  (client),
  UNNEST(property_values) AS value
GROUP BY
  client,
  value,
  total
ORDER BY
  pct DESC
LIMIT
  1000
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,141 @@ | ||
#standardSQL | ||
# Tallies how often each class, id, attribute, pseudo-class and pseudo-element
# name occurs in the selectors of one parsed stylesheet.
#   css: JSON-serialized stylesheet AST.
# Returns one array of (name, value) pairs per selector part type, where value
# is the number of occurrences of that name.
# On any failure a single class entry is returned whose name is the error text
# and whose value is 0; the other arrays are left unset.
# `walkSelectors`, `parsel`, `incrementByKey` and `sortObject` are not defined
# here; presumably they come from the library loaded through OPTIONS.
CREATE TEMPORARY FUNCTION getSelectorParts(css STRING)
RETURNS STRUCT<
  class ARRAY<STRUCT<name STRING, value INT64>>,
  id ARRAY<STRUCT<name STRING, value INT64>>,
  attribute ARRAY<STRUCT<name STRING, value INT64>>,
  pseudo_class ARRAY<STRUCT<name STRING, value INT64>>,
  pseudo_element ARRAY<STRUCT<name STRING, value INT64>>
> LANGUAGE js AS '''
try {
  // Walks every selector in the AST and counts names per node type.
  function compute(ast) {
    let ret = {
      class: {},
      id: {},
      attribute: {},
      "pseudo-class": {},
      "pseudo-element": {}
    };

    walkSelectors(ast, selector => {
      let sast = parsel.parse(selector, {list: false});

      parsel.walk(sast, node => {
        // Only node types listed in ret are tallied.
        if (node.type in ret) {
          incrementByKey(ret[node.type], node.name);
        }
      }, {subtree: true});
    });

    for (let type in ret) {
      ret[type] = sortObject(ret[type]);
    }

    return ret;
  }

  // Converts a {name: count} map into [{name, value}], dropping entries whose
  // count is not a number.
  function unzip(obj) {
    return Object.entries(obj).filter(([name, value]) => {
      return !isNaN(value);
    }).map(([name, value]) => ({name, value}));
  }

  const ast = JSON.parse(css);
  let parts = compute(ast);

  // Map the hyphenated node types onto the underscore field names of the STRUCT.
  return {
    class: unzip(parts.class),
    id: unzip(parts.id),
    attribute: unzip(parts.attribute),
    pseudo_class: unzip(parts['pseudo-class']),
    pseudo_element: unzip(parts['pseudo-element'])
  }
} catch (e) {
  // name is declared STRING, so convert the Error to text explicitly instead
  // of handing back the Error object itself.
  return {class: [{name: String(e), value: 0}]};
}
'''
OPTIONS (library="gs://httparchive/lib/css-utils.js");
|
||
# https://www.stevenmoseley.com/blog/tech/high-performance-sql-correlated-scalar-aggregate-reduction-queries | ||
# Builds a sortable key: `comparator` left-padded with '0' to 11 characters,
# followed by `data`. Taking MAX over such keys picks the data that belongs to
# the largest comparator.
CREATE TEMPORARY FUNCTION encode(comparator STRING, data STRING)
RETURNS STRING
AS (
  LPAD(comparator, 11, '0') || data
);

# Recovers `data` from a key built by encode() by skipping the 11-character
# comparator prefix.
CREATE TEMPORARY FUNCTION decode(value STRING)
RETURNS STRING
AS (
  SUBSTR(value, 12)
);
|
||
# For each client, reports the most frequently used class, id, attribute,
# pseudo-class and pseudo-element name together with its total frequency.
#
# Each part type is reduced to a single row per client BEFORE the joins.
# Joining the raw per-stylesheet rows of all five part types on `client` alone
# would multiply their row counts together for every client.
#
# The winner per client is chosen with the encode/decode trick: frequencies are
# zero-padded so that the string MAX orders by frequency first and by name
# second (ties go to the greatest name).
WITH selector_parts AS (
  SELECT
    client,
    getSelectorParts(css) AS parts
  FROM
    `httparchive.almanac.parsed_css`
  WHERE
    date = '2020-08-01' AND
    # Limit the size of the CSS to avoid OOM crashes.
    LENGTH(css) < 0.1 * 1024 * 1024
),

top_class AS (
  SELECT
    client,
    decode(MAX(encode(CAST(freq AS STRING), name))) AS class_name,
    MAX(freq) AS class_freq
  FROM (
    SELECT
      client,
      class.name AS name,
      SUM(class.value) AS freq
    FROM
      selector_parts,
      UNNEST(parts.class) AS class
    GROUP BY
      client,
      class.name)
  GROUP BY
    client
),

top_id AS (
  SELECT
    client,
    decode(MAX(encode(CAST(freq AS STRING), name))) AS id_name,
    MAX(freq) AS id_freq
  FROM (
    SELECT
      client,
      id.name AS name,
      SUM(id.value) AS freq
    FROM
      selector_parts,
      UNNEST(parts.id) AS id
    GROUP BY
      client,
      id.name)
  GROUP BY
    client
),

top_attribute AS (
  SELECT
    client,
    decode(MAX(encode(CAST(freq AS STRING), name))) AS attribute_name,
    MAX(freq) AS attribute_freq
  FROM (
    SELECT
      client,
      attribute.name AS name,
      SUM(attribute.value) AS freq
    FROM
      selector_parts,
      UNNEST(parts.attribute) AS attribute
    GROUP BY
      client,
      attribute.name)
  GROUP BY
    client
),

top_pseudo_class AS (
  SELECT
    client,
    decode(MAX(encode(CAST(freq AS STRING), name))) AS pseudo_class_name,
    MAX(freq) AS pseudo_class_freq
  FROM (
    SELECT
      client,
      pseudo_class.name AS name,
      SUM(pseudo_class.value) AS freq
    FROM
      selector_parts,
      UNNEST(parts.pseudo_class) AS pseudo_class
    GROUP BY
      client,
      pseudo_class.name)
  GROUP BY
    client
),

top_pseudo_element AS (
  SELECT
    client,
    decode(MAX(encode(CAST(freq AS STRING), name))) AS pseudo_element_name,
    MAX(freq) AS pseudo_element_freq
  FROM (
    SELECT
      client,
      pseudo_element.name AS name,
      SUM(pseudo_element.value) AS freq
    FROM
      selector_parts,
      UNNEST(parts.pseudo_element) AS pseudo_element
    GROUP BY
      client,
      pseudo_element.name)
  GROUP BY
    client
)

# Inner joins: a client is reported only if it has at least one name of every
# part type.
SELECT
  client,
  class_name,
  class_freq,
  id_name,
  id_freq,
  attribute_name,
  attribute_freq,
  pseudo_class_name,
  pseudo_class_freq,
  pseudo_element_name,
  pseudo_element_freq
FROM
  top_class
JOIN
  top_id
USING
  (client)
JOIN
  top_attribute
USING
  (client)
JOIN
  top_pseudo_class
USING
  (client)
JOIN
  top_pseudo_element
USING
  (client)
Oops, something went wrong.