diff --git a/content/blog/codegen-learnings/flow.png b/content/blog/codegen-learnings/flow.png new file mode 100644 index 000000000000..da4173678736 Binary files /dev/null and b/content/blog/codegen-learnings/flow.png differ diff --git a/content/blog/codegen-learnings/flow.tldr b/content/blog/codegen-learnings/flow.tldr new file mode 100644 index 000000000000..a0bbc8ded9c9 --- /dev/null +++ b/content/blog/codegen-learnings/flow.tldr @@ -0,0 +1,267 @@ +{ + "tldrawFileFormatVersion": 1, + "schema": { + "schemaVersion": 2, + "sequences": { + "com.tldraw.store": 4, + "com.tldraw.asset": 1, + "com.tldraw.camera": 1, + "com.tldraw.document": 2, + "com.tldraw.instance": 25, + "com.tldraw.instance_page_state": 5, + "com.tldraw.page": 1, + "com.tldraw.instance_presence": 6, + "com.tldraw.pointer": 1, + "com.tldraw.shape": 4, + "com.tldraw.asset.bookmark": 2, + "com.tldraw.asset.image": 5, + "com.tldraw.asset.video": 5, + "com.tldraw.shape.group": 0, + "com.tldraw.shape.text": 2, + "com.tldraw.shape.bookmark": 2, + "com.tldraw.shape.draw": 2, + "com.tldraw.shape.geo": 9, + "com.tldraw.shape.note": 8, + "com.tldraw.shape.line": 5, + "com.tldraw.shape.frame": 0, + "com.tldraw.shape.arrow": 5, + "com.tldraw.shape.highlight": 1, + "com.tldraw.shape.embed": 4, + "com.tldraw.shape.image": 4, + "com.tldraw.shape.video": 2, + "com.tldraw.binding.arrow": 0 + } + }, + "records": [ + { + "gridSize": 10, + "name": "", + "meta": {}, + "id": "document:document", + "typeName": "document" + }, + { + "meta": {}, + "id": "page:page", + "name": "Page 1", + "index": "a1", + "typeName": "page" + }, + { + "id": "pointer:pointer", + "typeName": "pointer", + "x": 357.43359375, + "y": 132.7734375, + "lastActivityTimestamp": 1735267617634, + "meta": {} + }, + { + "followingUserId": null, + "opacityForNextShape": 1, + "stylesForNextShape": { + "tldraw:size": "s", + "tldraw:font": "sans", + "tldraw:geo": "arrow-right" + }, + "brush": null, + "scribbles": [], + "cursor": { + "type": "cross", + 
"rotation": 0 + }, + "isFocusMode": false, + "exportBackground": true, + "isDebugMode": false, + "isToolLocked": false, + "screenBounds": { + "x": 0, + "y": 0, + "w": 1128, + "h": 978 + }, + "insets": [ + false, + false, + true, + false + ], + "zoomBrush": null, + "isGridMode": false, + "isPenMode": false, + "chatMessage": "", + "isChatting": false, + "highlightedUserIds": [], + "isFocused": true, + "devicePixelRatio": 2, + "isCoarsePointer": false, + "isHoveringCanvas": true, + "openMenus": [], + "isChangingStyle": false, + "isReadonly": false, + "meta": {}, + "duplicateProps": null, + "id": "instance:instance", + "currentPageId": "page:page", + "typeName": "instance" + }, + { + "editingShapeId": null, + "croppingShapeId": null, + "selectedShapeIds": [ + "shape:0vPmbhCIhdgPr7gYnOxm9" + ], + "hoveredShapeId": "shape:0vPmbhCIhdgPr7gYnOxm9", + "erasingShapeIds": [], + "hintingShapeIds": [], + "focusedGroupId": null, + "meta": {}, + "id": "instance_page_state:page:page", + "pageId": "page:page", + "typeName": "instance_page_state" + }, + { + "x": 49.0078125, + "y": 97.7109375, + "z": 1, + "meta": {}, + "id": "camera:page:page", + "typeName": "camera" + }, + { + "x": 40.28125, + "y": 79.8203125, + "rotation": 0, + "isLocked": false, + "opacity": 1, + "meta": {}, + "id": "shape:BJfJwAyLKVoUOGaixU2Xf", + "type": "text", + "props": { + "color": "black", + "size": "s", + "w": 252.359375, + "text": "User query:\n\n\"Generate code for S3 Bucket\"", + "font": "sans", + "textAlign": "start", + "autoSize": true, + "scale": 1 + }, + "parentId": "page:page", + "index": "a1", + "typeName": "shape" + }, + { + "x": 374.58203125, + "y": 54.3125, + "rotation": 0, + "isLocked": false, + "opacity": 1, + "meta": {}, + "id": "shape:z2gtvIBdf6_eCX2gzQ2L1", + "type": "text", + "props": { + "color": "black", + "size": "s", + "w": 310.2265625, + "text": "Search terms:\n\n\"AWS S3 bucket\",\n\"Pulumi AWS S3\",\n\"create S3 bucket Pulumi TypeScript\"", + "font": "sans", + "textAlign": "start", 
+ "autoSize": true, + "scale": 1 + }, + "parentId": "page:page", + "index": "a22v7", + "typeName": "shape" + }, + { + "x": 25.6796875, + "y": 58.140625, + "rotation": 0, + "isLocked": false, + "opacity": 1, + "meta": {}, + "id": "shape:PGJKCoOonmat14xrifIp0", + "type": "geo", + "props": { + "w": 286.8671875, + "h": 132.9296875, + "geo": "rectangle", + "color": "black", + "labelColor": "black", + "fill": "none", + "dash": "draw", + "size": "s", + "font": "sans", + "text": "", + "align": "middle", + "verticalAlign": "middle", + "growY": 0, + "url": "", + "scale": 1 + }, + "parentId": "page:page", + "index": "a39Cr", + "typeName": "shape" + }, + { + "x": 362.58203125, + "y": 35.85546875000003, + "rotation": 0, + "isLocked": false, + "opacity": 1, + "meta": {}, + "id": "shape:hrkCX2QL0NyXDz_xl3n_i", + "type": "geo", + "props": { + "w": 342.9609375, + "h": 172.41015624999997, + "geo": "rectangle", + "color": "black", + "labelColor": "black", + "fill": "none", + "dash": "draw", + "size": "s", + "font": "sans", + "text": "", + "align": "middle", + "verticalAlign": "middle", + "growY": 0, + "url": "", + "scale": 1 + }, + "parentId": "page:page", + "index": "a40nT", + "typeName": "shape" + }, + { + "x": 322.2734375, + "y": 100.125, + "rotation": 0, + "isLocked": false, + "opacity": 1, + "meta": {}, + "id": "shape:0vPmbhCIhdgPr7gYnOxm9", + "type": "geo", + "props": { + "w": 35.16015625, + "h": 32.6484375, + "geo": "arrow-right", + "color": "black", + "labelColor": "black", + "fill": "none", + "dash": "draw", + "size": "s", + "font": "sans", + "text": "", + "align": "middle", + "verticalAlign": "middle", + "growY": 0, + "url": "", + "scale": 1 + }, + "parentId": "page:page", + "index": "a55Ml", + "typeName": "shape" + } + ] +} \ No newline at end of file diff --git a/content/blog/codegen-learnings/index.md b/content/blog/codegen-learnings/index.md index 9084b265e325..bbaccda18cb0 100644 --- a/content/blog/codegen-learnings/index.md +++ 
b/content/blog/codegen-learnings/index.md @@ -82,32 +82,16 @@ Because you were looking for the word "pie", you also retrieved a recipe for a S -Now let's formalize this a bit. Recall measures the ratio of the relevant documents retrieved to the total number of relevant docuemtns in RAG: +Now let's formalize this a bit. Recall measures the ratio of the relevant documents retrieved to the total number of relevant documents in RAG: -TODO - -old: - -$$Recall = \frac{N(Retrieved\_documents \cap Relevant\_documents)}{N(Relevant\_documents)}$$ - -fixed1: - -$$Recall = \frac{N(\text{Retrieved\_documents} \cap \text{Relevant\_documents})}{N(\text{Relevant\_documents})}$$ - -fixed2: - -$$Recall = \frac{N(Retrieved\text{\_}documents \cap Relevant\text{\_}documents)}{N(Relevant\text{\_}documents)}$$ - -alternative - -$$Recall = \frac{N(\text{RetrievedDocuments} \cap \text{RelevantDocuments})}{N(\text{RelevantDocuments})}$$ +$$Recall = \frac{N(Retrieved \cap Relevant)}{N(Relevant)}$$ Where -- $N(Retrieved\_documents \cap Relevant\_documents)$ is the number of documents that are both retrieved and relevant. -- $N(Relevant\_documents)$ is the total number of relevant documents in the database. +- $N(Retrieved \cap Relevant)$ is the number of documents that are both retrieved and relevant. +- $N(Relevant)$ is the total number of relevant documents in the database. Good recall means that many documents relevant to the query were retrieved. -$$Precision = \frac{N(Retrieved\_documents \cap Relevant\_documents)}{N(Retrieved\_documents)}$$ +$$Precision = \frac{N(Retrieved \cap Relevant)}{N(Retrieved)}$$ -Where $N(Retrieved\_documents)$ is the total number of documents that were retrieved. +Where $N(Retrieved)$ is the total number of documents that were retrieved. @@ -122,12 +106,25 @@ Precision and recall are essential in understanding the information retrieval qu Fortunately, other metrics that often can effectively estimate retrieval quality have been developed. We have found a metric that can predict, with some degree of accuracy, whether the generated code will successfully compile.
For this metric, we compare the _tokens_ present in the prompted produced by the LLM with the number of tokens present in the actually generated code. (By token here we understand a compiler token - an identifier such as the name of a class, method or a field and not a traditional LLM token concept), Intuitively, if a token present in the prompt also appears in the generated program, we can assume that the token effectively contributed to the generated program. Tokens in the generated program that were not part of the prompt are not necessarily wrong but they are less trusted (they can come from the LLM built-in knowledge or were, ahem, hallucinated) -$$prompt \ coverage = \frac{N(Tokens\_in\_prompt \cap Tokens\_in\_code)}{N(Tokens\_in\_code)} $$ +1: +$$prompt \ coverage = \frac{N(Tokens\_in\_prompt \cap Tokens\_in\_code)}{N(Tokens\_in\_code)}$$ + +2: +$$prompt \ coverage = \frac{N(\text{Tokens\_in\_prompt} \cap \text{Tokens\_in\_code})}{N(\text{Tokens\_in\_code})}$$ Prompt coverage is a metric we can observe in production, and it's one of several metrics we use when updating providers to ensure we haven't regressed the quality of the RAG. + + + +
+ Flow of blah +
+
+
+