Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make detailed notes on how table, query and row views work right now #2054

Open
simonw opened this issue Apr 6, 2023 · 13 comments
Open

Make detailed notes on how table, query and row views work right now #2054

simonw opened this issue Apr 6, 2023 · 13 comments
Labels

Comments

@simonw
Copy link
Owner

simonw commented Apr 6, 2023

@simonw simonw added the research label Apr 6, 2023
@simonw
Copy link
Owner Author

simonw commented Apr 6, 2023

I'm going to make notes against the code in the most recent alpha release, ignoring the recent work I did to refactor TableView.

https://github.com/simonw/datasette/tree/1.0a2/datasette/views

@simonw
Copy link
Owner Author

simonw commented Apr 6, 2023

These classes - TableView and RowView and QueryView - all subclass DataView which subclasses BaseView.

@simonw
Copy link
Owner Author

simonw commented Apr 6, 2023

Here's BaseView:

class BaseView:
ds = None
has_json_alternate = True
def __init__(self, datasette):
self.ds = datasette
async def head(self, *args, **kwargs):
response = await self.get(*args, **kwargs)
response.body = b""
return response
def database_color(self, database):
return "ff0000"
async def method_not_allowed(self, request):
if (
request.path.endswith(".json")
or request.headers.get("content-type") == "application/json"
):
response = Response.json(
{"ok": False, "error": "Method not allowed"}, status=405
)
else:
response = Response.text("Method not allowed", status=405)
return response
async def options(self, request, *args, **kwargs):
return Response.text("ok")
async def get(self, request, *args, **kwargs):
return await self.method_not_allowed(request)
async def post(self, request, *args, **kwargs):
return await self.method_not_allowed(request)
async def put(self, request, *args, **kwargs):
return await self.method_not_allowed(request)
async def patch(self, request, *args, **kwargs):
return await self.method_not_allowed(request)
async def delete(self, request, *args, **kwargs):
return await self.method_not_allowed(request)
async def dispatch_request(self, request):
if self.ds:
await self.ds.refresh_schemas()
handler = getattr(self, request.method.lower(), None)
response = await handler(request)
if self.ds.cors:
add_cors_headers(response.headers)
return response
async def render(self, templates, request, context=None):
context = context or {}
template = self.ds.jinja_env.select_template(templates)
template_context = {
**context,
**{
"database_color": self.database_color,
"select_templates": [
f"{'*' if template_name == template.name else ''}{template_name}"
for template_name in templates
],
},
}
headers = {}
if self.has_json_alternate:
alternate_url_json = self.ds.absolute_url(
request,
self.ds.urls.path(path_with_format(request=request, format="json")),
)
template_context["alternate_url_json"] = alternate_url_json
headers.update(
{
"Link": '{}; rel="alternate"; type="application/json+datasette"'.format(
alternate_url_json
)
}
)
return Response.html(
await self.ds.render_template(
template,
template_context,
request=request,
view_name=self.name,
),
headers=headers,
)

It has methods for the options, get, post, delete, put, patch and head HTTP verbs, most defaulting to returinng a 405 Method not allowed message in plain text or JSON, depending on this check:

async def method_not_allowed(self, request):
if (
request.path.endswith(".json")
or request.headers.get("content-type") == "application/json"
):
response = Response.json(
{"ok": False, "error": "Method not allowed"}, status=405
)
else:
response = Response.text("Method not allowed", status=405)
return response

Also adds CORS headers to anything if CORS mode is on:

if self.ds.cors:
add_cors_headers(response.headers)

And adds the database_color (weirdly) and the select_templates variables to the template context:

template = self.ds.jinja_env.select_template(templates)
template_context = {
**context,
**{
"database_color": self.database_color,
"select_templates": [
f"{'*' if template_name == template.name else ''}{template_name}"
for template_name in templates
],
},
}

And has special code for setting the Link: ...; rel="alternate" HTTP header:

if self.has_json_alternate:
alternate_url_json = self.ds.absolute_url(
request,
self.ds.urls.path(path_with_format(request=request, format="json")),
)
template_context["alternate_url_json"] = alternate_url_json
headers.update(
{
"Link": '{}; rel="alternate"; type="application/json+datasette"'.format(
alternate_url_json
)
}
)

@simonw
Copy link
Owner Author

simonw commented Apr 6, 2023

The DataView class does a LOT of work - mostly involving CSV responses.

class DataView(BaseView):
name = ""
def redirect(self, request, path, forward_querystring=True, remove_args=None):
if request.query_string and "?" not in path and forward_querystring:
path = f"{path}?{request.query_string}"
if remove_args:
path = path_with_removed_args(request, remove_args, path=path)
r = Response.redirect(path)
r.headers["Link"] = f"<{path}>; rel=preload"
if self.ds.cors:
add_cors_headers(r.headers)
return r
async def data(self, request):
raise NotImplementedError
def get_templates(self, database, table=None):
assert NotImplemented
async def as_csv(self, request, database):
kwargs = {}
stream = request.args.get("_stream")
# Do not calculate facets or counts:
extra_parameters = [
"{}=1".format(key)
for key in ("_nofacet", "_nocount")
if not request.args.get(key)
]
if extra_parameters:
# Replace request object with a new one with modified scope
if not request.query_string:
new_query_string = "&".join(extra_parameters)
else:
new_query_string = (
request.query_string + "&" + "&".join(extra_parameters)
)
new_scope = dict(
request.scope, query_string=new_query_string.encode("latin-1")
)
receive = request.receive
request = Request(new_scope, receive)
if stream:
# Some quick soundness checks
if not self.ds.setting("allow_csv_stream"):
raise BadRequest("CSV streaming is disabled")
if request.args.get("_next"):
raise BadRequest("_next not allowed for CSV streaming")
kwargs["_size"] = "max"
# Fetch the first page
try:
response_or_template_contexts = await self.data(request)
if isinstance(response_or_template_contexts, Response):
return response_or_template_contexts
elif len(response_or_template_contexts) == 4:
data, _, _, _ = response_or_template_contexts
else:
data, _, _ = response_or_template_contexts
except (sqlite3.OperationalError, InvalidSql) as e:
raise DatasetteError(str(e), title="Invalid SQL", status=400)
except sqlite3.OperationalError as e:
raise DatasetteError(str(e))
except DatasetteError:
raise
# Convert rows and columns to CSV
headings = data["columns"]
# if there are expanded_columns we need to add additional headings
expanded_columns = set(data.get("expanded_columns") or [])
if expanded_columns:
headings = []
for column in data["columns"]:
headings.append(column)
if column in expanded_columns:
headings.append(f"{column}_label")
content_type = "text/plain; charset=utf-8"
preamble = ""
postamble = ""
trace = request.args.get("_trace")
if trace:
content_type = "text/html; charset=utf-8"
preamble = (
"<html><head><title>CSV debug</title></head>"
'<body><textarea style="width: 90%; height: 70vh">'
)
postamble = "</textarea></body></html>"
async def stream_fn(r):
nonlocal data, trace
limited_writer = LimitedWriter(r, self.ds.setting("max_csv_mb"))
if trace:
await limited_writer.write(preamble)
writer = csv.writer(EscapeHtmlWriter(limited_writer))
else:
writer = csv.writer(limited_writer)
first = True
next = None
while first or (next and stream):
try:
kwargs = {}
if next:
kwargs["_next"] = next
if not first:
data, _, _ = await self.data(request, **kwargs)
if first:
if request.args.get("_header") != "off":
await writer.writerow(headings)
first = False
next = data.get("next")
for row in data["rows"]:
if any(isinstance(r, bytes) for r in row):
new_row = []
for column, cell in zip(headings, row):
if isinstance(cell, bytes):
# If this is a table page, use .urls.row_blob()
if data.get("table"):
pks = data.get("primary_keys") or []
cell = self.ds.absolute_url(
request,
self.ds.urls.row_blob(
database,
data["table"],
path_from_row_pks(row, pks, not pks),
column,
),
)
else:
# Otherwise generate URL for this query
url = self.ds.absolute_url(
request,
path_with_format(
request=request,
format="blob",
extra_qs={
"_blob_column": column,
"_blob_hash": hashlib.sha256(
cell
).hexdigest(),
},
replace_format="csv",
),
)
cell = url.replace("&_nocount=1", "").replace(
"&_nofacet=1", ""
)
new_row.append(cell)
row = new_row
if not expanded_columns:
# Simple path
await writer.writerow(row)
else:
# Look for {"value": "label": } dicts and expand
new_row = []
for heading, cell in zip(data["columns"], row):
if heading in expanded_columns:
if cell is None:
new_row.extend(("", ""))
else:
assert isinstance(cell, dict)
new_row.append(cell["value"])
new_row.append(cell["label"])
else:
new_row.append(cell)
await writer.writerow(new_row)
except Exception as e:
sys.stderr.write("Caught this error: {}\n".format(e))
sys.stderr.flush()
await r.write(str(e))
return
await limited_writer.write(postamble)
headers = {}
if self.ds.cors:
add_cors_headers(headers)
if request.args.get("_dl", None):
if not trace:
content_type = "text/csv; charset=utf-8"
disposition = 'attachment; filename="{}.csv"'.format(
request.url_vars.get("table", database)
)
headers["content-disposition"] = disposition
return AsgiStream(stream_fn, headers=headers, content_type=content_type)
async def get(self, request):
db = await self.ds.resolve_database(request)
database = db.name
database_route = db.route
_format = request.url_vars["format"]
data_kwargs = {}
if _format == "csv":
return await self.as_csv(request, database_route)
if _format is None:
# HTML views default to expanding all foreign key labels
data_kwargs["default_labels"] = True
extra_template_data = {}
start = time.perf_counter()
status_code = None
templates = []
try:
response_or_template_contexts = await self.data(request, **data_kwargs)
if isinstance(response_or_template_contexts, Response):
return response_or_template_contexts
# If it has four items, it includes an HTTP status code
if len(response_or_template_contexts) == 4:
(
data,
extra_template_data,
templates,
status_code,
) = response_or_template_contexts
else:
data, extra_template_data, templates = response_or_template_contexts
except QueryInterrupted as ex:
raise DatasetteError(
textwrap.dedent(
"""
<p>SQL query took too long. The time limit is controlled by the
<a href="https://docs.datasette.io/en/stable/settings.html#sql-time-limit-ms">sql_time_limit_ms</a>
configuration option.</p>
<textarea style="width: 90%">{}</textarea>
<script>
let ta = document.querySelector("textarea");
ta.style.height = ta.scrollHeight + "px";
</script>
""".format(
escape(ex.sql)
)
).strip(),
title="SQL Interrupted",
status=400,
message_is_html=True,
)
except (sqlite3.OperationalError, InvalidSql) as e:
raise DatasetteError(str(e), title="Invalid SQL", status=400)
except sqlite3.OperationalError as e:
raise DatasetteError(str(e))
except DatasetteError:
raise
end = time.perf_counter()
data["query_ms"] = (end - start) * 1000
for key in ("source", "source_url", "license", "license_url"):
value = self.ds.metadata(key)
if value:
data[key] = value
# Special case for .jsono extension - redirect to _shape=objects
if _format == "jsono":
return self.redirect(
request,
path_with_added_args(
request,
{"_shape": "objects"},
path=request.path.rsplit(".jsono", 1)[0] + ".json",
),
forward_querystring=False,
)
if _format in self.ds.renderers.keys():
# Dispatch request to the correct output format renderer
# (CSV is not handled here due to streaming)
result = call_with_supported_arguments(
self.ds.renderers[_format][0],
datasette=self.ds,
columns=data.get("columns") or [],
rows=data.get("rows") or [],
sql=data.get("query", {}).get("sql", None),
query_name=data.get("query_name"),
database=database,
table=data.get("table"),
request=request,
view_name=self.name,
# These will be deprecated in Datasette 1.0:
args=request.args,
data=data,
)
if asyncio.iscoroutine(result):
result = await result
if result is None:
raise NotFound("No data")
if isinstance(result, dict):
r = Response(
body=result.get("body"),
status=result.get("status_code", status_code or 200),
content_type=result.get("content_type", "text/plain"),
headers=result.get("headers"),
)
elif isinstance(result, Response):
r = result
if status_code is not None:
# Over-ride the status code
r.status = status_code
else:
assert False, f"{result} should be dict or Response"
else:
extras = {}
if callable(extra_template_data):
extras = extra_template_data()
if asyncio.iscoroutine(extras):
extras = await extras
else:
extras = extra_template_data
url_labels_extra = {}
if data.get("expandable_columns"):
url_labels_extra = {"_labels": "on"}
renderers = {}
for key, (_, can_render) in self.ds.renderers.items():
it_can_render = call_with_supported_arguments(
can_render,
datasette=self.ds,
columns=data.get("columns") or [],
rows=data.get("rows") or [],
sql=data.get("query", {}).get("sql", None),
query_name=data.get("query_name"),
database=database,
table=data.get("table"),
request=request,
view_name=self.name,
)
it_can_render = await await_me_maybe(it_can_render)
if it_can_render:
renderers[key] = self.ds.urls.path(
path_with_format(
request=request, format=key, extra_qs={**url_labels_extra}
)
)
url_csv_args = {"_size": "max", **url_labels_extra}
url_csv = self.ds.urls.path(
path_with_format(request=request, format="csv", extra_qs=url_csv_args)
)
url_csv_path = url_csv.split("?")[0]
context = {
**data,
**extras,
**{
"renderers": renderers,
"url_csv": url_csv,
"url_csv_path": url_csv_path,
"url_csv_hidden_args": [
(key, value)
for key, value in urllib.parse.parse_qsl(request.query_string)
if key not in ("_labels", "_facet", "_size")
]
+ [("_size", "max")],
"settings": self.ds.settings_dict(),
},
}
if "metadata" not in context:
context["metadata"] = self.ds.metadata
r = await self.render(templates, request=request, context=context)
if status_code is not None:
r.status = status_code
ttl = request.args.get("_ttl", None)
if ttl is None or not ttl.isdigit():
ttl = self.ds.setting("default_cache_ttl")
return self.set_response_headers(r, ttl)
def set_response_headers(self, response, ttl):
# Set far-future cache expiry
if self.ds.cache_headers and response.status == 200:
ttl = int(ttl)
if ttl == 0:
ttl_header = "no-cache"
else:
ttl_header = f"max-age={ttl}"
response.headers["Cache-Control"] = ttl_header
response.headers["Referrer-Policy"] = "no-referrer"
if self.ds.cors:
add_cors_headers(response.headers)
return response

It has a redirect() method with some complex logic and CORS handling:

def redirect(self, request, path, forward_querystring=True, remove_args=None):
if request.query_string and "?" not in path and forward_querystring:
path = f"{path}?{request.query_string}"
if remove_args:
path = path_with_removed_args(request, remove_args, path=path)
r = Response.redirect(path)
r.headers["Link"] = f"<{path}>; rel=preload"
if self.ds.cors:
add_cors_headers(r.headers)
return r

It uses this method a lot, which has to be over-ridden in the TableView and RowView and QueryView subclasses:

async def data(self, request):
raise NotImplementedError

This method:

async def as_csv(self, request, database):

Is the bulk of the complexity, because it knows how to both turn a list of SQLite rows into a CSV file but also knows how to call .data() repeatedly with different pagination arguments in order to stream CSV back for a large table.

The async def get() method for GET requests is also very complicated. It mainly handles format stuff - knowing how to render HTML v.s. JSON v.s. CSV v.s. other formats specified using this plugin hook: https://docs.datasette.io/en/1.0a2/plugin_hooks.html#register-output-renderer-datasette

Plus it catches interrupted queries and returns a special error page for those (and other error messages too):

except QueryInterrupted as ex:
raise DatasetteError(
textwrap.dedent(
"""
<p>SQL query took too long. The time limit is controlled by the
<a href="https://docs.datasette.io/en/stable/settings.html#sql-time-limit-ms">sql_time_limit_ms</a>
configuration option.</p>
<textarea style="width: 90%">{}</textarea>
<script>
let ta = document.querySelector("textarea");
ta.style.height = ta.scrollHeight + "px";
</script>
""".format(
escape(ex.sql)
)
).strip(),
title="SQL Interrupted",
status=400,
message_is_html=True,
)
except (sqlite3.OperationalError, InvalidSql) as e:
raise DatasetteError(str(e), title="Invalid SQL", status=400)
except sqlite3.OperationalError as e:
raise DatasetteError(str(e))
except DatasetteError:
raise

It adds the time taken to execute the queries:

end = time.perf_counter()
data["query_ms"] = (end - start) * 1000

@simonw
Copy link
Owner Author

simonw commented Apr 6, 2023

There are actually five classes that subclass DataView:

class RowView(DataView):

class DatabaseView(DataView):

class DatabaseDownload(DataView):

class QueryView(DataView):

class TableView(DataView):

I don't think DatabaseView and DatabaseDownload should have subclassed that at all, since they don't return a table of data.

@simonw
Copy link
Owner Author

simonw commented Apr 6, 2023

I've been hoping to move entirely away from class-based views, but now I'm wondering if that's the right decision given the need to support HTTP verbs.

It is cleaner to have a class that has methods for each verb, or to have a single function that can behave differently depending on the verb?

Or should I have a mechanism for dispatching to separate functions based on the verbs that isn't implemented in a base class?

The trouble with using base classes is that I've already shown that they tempt me to put logic in weird places, which makes code harder to modify later on.

The thing here is that the ONLY thing I want to use the base class for is as an HTTP verb switcher - which makes me doubt that I should risk taking on the other temptations of having a base class.

@simonw
Copy link
Owner Author

simonw commented Apr 6, 2023

I asked GPT-4 to compare URL routing based on path and verb approaches in different languages and frameworks and spit out some examples. Full transcript here: https://gist.github.com/simonw/fa2379b97420404a81b0fcdb4db79657

Initial prompt:

Show me code examples of different web frameworks in Python and JavaScript and Go illustrating how HTTP routing works - in particular the problem of mapping an incoming HTTP request to some code based on both the URL path and the HTTP verb

Don't show installation instructions, just get straight to the code snippets

Flask:

from flask import Flask

app = Flask(__name__)

@app.route('/example', methods=['GET'])
def get_example():
    return "This is a GET request."

@app.route('/example', methods=['POST'])
def post_example():
    return "This is a POST request."

if __name__ == '__main__':
    app.run()

Express:

const express = require('express');
const app = express();

app.get('/example', (req, res) => {
  res.send('This is a GET request.');
});

app.post('/example', (req, res) => {
  res.send('This is a POST request.');
});

app.listen(3000, () => {
  console.log('Server is running on port 3000');
});

Gin:

package main

import (
	"github.com/gin-gonic/gin"
)

func main() {
	r := gin.Default()

	r.GET("/example", func(c *gin.Context) {
		c.String(200, "This is a GET request.")
	})

	r.POST("/example", func(c *gin.Context) {
		c.String(200, "This is a POST request.")
	})

	r.Run()
}

Actix-web:

use actix_web::{web, App, HttpResponse, HttpServer, Responder};

async fn get_example() -> impl Responder {
    HttpResponse::Ok().body("This is a GET request.")
}

async fn post_example() -> impl Responder {
    HttpResponse::Ok().body("This is a POST request.")
}

#[actix_web::main]
async fn main() -> std::io::Result<()> {
    HttpServer::new(|| {
        App::new()
            .route("/example", web::get().to(get_example))
            .route("/example", web::post().to(post_example))
    })
    .bind("127.0.0.1:8080")?
    .run()
    .await
}

Koa:

const Koa = require('koa');
const Router = require('koa-router');

const app = new Koa();
const router = new Router();

router.get('/example', (ctx) => {
  ctx.body = 'This is a GET request.';
});

router.post('/example', (ctx) => {
  ctx.body = 'This is a POST request.';
});

app.use(router.routes());
app.use(router.allowedMethods());

app.listen(3000, () => {
  console.log('Server is running on port 3000');
});

Django:

from django.http import HttpResponse
from django.views import View

class ExampleView(View):
    def get(self, request):
        return HttpResponse("This is a GET request.")

    def post(self, request):
        return HttpResponse("This is a POST request.")
from django.urls import path
from .views import ExampleView

urlpatterns = [
    path('example/', ExampleView.as_view(), name='example'),
]

Rocket:

#![feature(proc_macro_hygiene, decl_macro)]

#[macro_use] extern crate rocket;

use rocket::response::content;

#[get("/example")]
fn get_example() -> content::Plain<&'static str> {
    content::Plain("This is a GET request.")
}

#[post("/example")]
fn post_example() -> content::Plain<&'static str> {
    content::Plain("This is a POST request.")
}

#[launch]
fn rocket() -> _ {
    rocket::build().mount("/", routes![get_example, post_example])
}

Elixir/Phoenix:

defmodule ExampleAppWeb.Router do
  use ExampleAppWeb, :router

  pipeline :api do
    plug :accepts, ["json"]
  end

  scope "/api", ExampleAppWeb do
    pipe_through :api

    get "/example", ExampleController, :get_example
    post "/example", ExampleController, :post_example
  end
end

Bottle:

from bottle import Bottle, run

app = Bottle()

@app.route('/example', method='GET')
def get_example():
    return "This is a GET request."

@app.route('/example', method='POST')
def post_example():
    return "This is a POST request."

if __name__ == '__main__':
    run(app, host='localhost', port=8080)

FastAPI:

from fastapi import FastAPI

app = FastAPI()

@app.get("/example")
async def get_example():
    return {"message": "This is a GET request."}

@app.post("/example")
async def post_example():
    return {"message": "This is a POST request."}

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="127.0.0.1", port=8000)

@simonw
Copy link
Owner Author

simonw commented Apr 6, 2023

Turned this into a TIL: https://til.simonwillison.net/gpt3/gpt4-api-design

@simonw
Copy link
Owner Author

simonw commented Apr 6, 2023

I think there are two main approaches:

  • Map path to a function, that function then behaves differently on GET/POST/etc
  • Map method AND path to a function - the app.get('/example') v.s. app.post('/example') ones

The decorator approach is calling out to me a bit now.

@simonw
Copy link
Owner Author

simonw commented Apr 6, 2023

I'm contemplating a new approach: using a class with static methods. Something like this:

class TableView(MethodRouter):
    @staticmethod
    async def get(request):
        return Response.text("GET")

    @staticmethod
    async def post(request):
        return Response.text("POST")

So effectively the class is just there to bundle together verb implementations, and to provide a route(request) method which knows how to dispatch them to the right place.

It can offer default HEAD and OPTIONS methods too.

@simonw
Copy link
Owner Author

simonw commented Apr 6, 2023

I actually quite like that. I could even use @classmethod and have utility methods defined on that class that both get() and post() could call.

The crucial rule here is NO INSTANCE STATE - that's what makes routing to classes particularly damaging, and encourages code that's hard to maintain.

@dsisnero
Copy link

dsisnero commented Apr 7, 2023

you should have a look at Roda written in ruby .

@simonw
Copy link
Owner Author

simonw commented Apr 7, 2023

Ooh that one's really interesting - very different from the others:

# app.rb
require "roda"

class App < Roda
  route do |r|
    r.root do
      "Home page"
    end

    r.on "pages" do
      r.get ":slug" do |slug|
        "Page: #{slug}"
      end
    end

    r.on "news" do
      r.get ":yyyy/:mm/:dd" do |yyyy, mm, dd|
        "News for #{yyyy}/#{mm}/#{dd}"
      end
    end
  end
end

# config.ru
require_relative "app"
run App.freeze.app

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
Projects
None yet
Development

No branches or pull requests

2 participants