diff --git a/.nojekyll b/.nojekyll index 3b8427a..c7a0e59 100644 --- a/.nojekyll +++ b/.nojekyll @@ -1 +1 @@ -3470282d \ No newline at end of file +e4a00f03 \ No newline at end of file diff --git a/labs/lab0-setup.pdf b/labs/lab0-setup.pdf index e0645f0..a208e7a 100644 Binary files a/labs/lab0-setup.pdf and b/labs/lab0-setup.pdf differ diff --git a/labs/lab1-submission.pdf b/labs/lab1-submission.pdf index 11d3e31..f2c2b23 100644 Binary files a/labs/lab1-submission.pdf and b/labs/lab1-submission.pdf differ diff --git a/labs/lab2-testing.pdf b/labs/lab2-testing.pdf index 150b4fb..16ed91e 100644 Binary files a/labs/lab2-testing.pdf and b/labs/lab2-testing.pdf differ diff --git a/labs/lab3-debugging.pdf b/labs/lab3-debugging.pdf index 61157c5..e0ad4a9 100644 Binary files a/labs/lab3-debugging.pdf and b/labs/lab3-debugging.pdf differ diff --git a/ps/ps1.pdf b/ps/ps1.pdf index 0b02447..a6207ff 100644 Binary files a/ps/ps1.pdf and b/ps/ps1.pdf differ diff --git a/ps/ps2.pdf b/ps/ps2.pdf index 4982206..e7322a4 100644 Binary files a/ps/ps2.pdf and b/ps/ps2.pdf differ diff --git a/ps/ps3.pdf b/ps/ps3.pdf index 5b0537e..6e8c0fa 100644 Binary files a/ps/ps3.pdf and b/ps/ps3.pdf differ diff --git a/ps/regex.pdf b/ps/regex.pdf index 9c2942d..d1b9e80 100644 Binary files a/ps/regex.pdf and b/ps/regex.pdf differ diff --git a/search.json b/search.json index 09418dd..631303b 100644 --- a/search.json +++ b/search.json @@ -452,7 +452,7 @@ "href": "units/unit5-programming.html#interacting-with-the-operating-system", "title": "Programming concepts", "section": "Interacting with the operating system", - "text": "Interacting with the operating system\nScripting languages allow one to interact with the operating system in various ways. Most allow you to call out to the shell to run arbitrary shell code and save results within your session.\nI’ll assume everyone knows about the following functions/functionality for interacting with the filesystem and file in Python: os.getcwd, os.chdir, import, pickle.dump, pickle.load\nAlso in IPython there is additional functionality/syntax.\nHere are a variety of tools for interacting with the operating system:\n\nTo run UNIX commands from within Python, use subprocess.run(), as follows, noting that we can save the result of a system call to an R object:\n\nimport subprocess, io\nsubprocess.run([\"ls\", \"-al\"]) ## results apparently not shown when compiled...\n\nCompletedProcess(args=['ls', '-al'], returncode=0)\n\nfiles = subprocess.run([\"ls\", \"-al\"], capture_output = True)\nfiles.stdout\n\nb'total 3168\\ndrwxr-sr-x 13 paciorek scfstaff 79 Sep 20 09:36 .\\ndrwxr-sr-x 13 paciorek scfstaff 38 Sep 20 09:19 ..\\n-rw-r--r-- 1 paciorek scfstaff 117142 Sep 7 18:13 chatgpt-regex-numbers.png\\n-rw-r--r-- 1 paciorek scfstaff 176 Aug 29 13:46 debug_code.py\\n-rw-r--r-- 1 paciorek scfstaff 216 Jul 20 08:08 debug_code.py~\\n-rw-r--r-- 1 paciorek scfstaff 42 Aug 29 13:46 dummyfun.py\\n-rw-r--r-- 1 paciorek scfstaff 175396 Aug 29 13:46 exampleGraphic.png\\n-rw-r--r-- 1 paciorek scfstaff 1036183 Aug 29 13:46 file.txt\\n-rw-r--r-- 1 paciorek scfstaff 59 Aug 29 13:46 foo.py\\n-rw-r--r-- 1 paciorek scfstaff 20260 Jul 26 11:45 graph.png\\n-rw-r--r-- 1 paciorek scfstaff 200 Aug 24 08:27 html.tips\\n-rw-r--r-- 1 paciorek scfstaff 45 Aug 24 07:54 html.tips~\\ndrwxr-sr-x 2 paciorek scfstaff 3 Aug 28 16:39 .ipynb_checkpoints\\n-rw-r--r-- 1 paciorek scfstaff 2464 Jul 26 11:45 linked-list.png\\n-rw-r--r-- 1 paciorek scfstaff 98 Aug 29 13:46 local.py\\n-rw-r--r-- 1 paciorek scfstaff 79 Aug 29 13:46 
mymod.py\\ndrwxr-sr-x 4 paciorek scfstaff 8 Sep 15 17:26 mypkg\\n-rw-r--r-- 1 paciorek scfstaff 401 Aug 29 13:46 mysqrt.py\\n-rw-r--r-- 1 paciorek scfstaff 63998 Aug 18 11:19 normalized_example.png\\ndrwxr-sr-x 2 paciorek scfstaff 12 Sep 20 09:35 __pycache__\\n-rw-r--r-- 1 paciorek scfstaff 223 Jul 9 14:10 q.py\\n-rw-r--r-- 1 paciorek scfstaff 590 Jul 19 18:19 run_no_break2.py\\n-rw-r--r-- 1 paciorek scfstaff 588 Jul 19 18:14 run_no_break2.py~\\n-rw-r--r-- 1 paciorek scfstaff 381 Jul 19 17:50 run_no_break_full.py~\\n-rw-r--r-- 1 paciorek scfstaff 573 Aug 29 13:46 run_no_break.py\\n-rw-r--r-- 1 paciorek scfstaff 656 Jul 19 17:55 run_no_break.py~\\n-rw-r--r-- 1 paciorek scfstaff 591 Aug 29 13:46 run_with_break.py\\n-rw-r--r-- 1 paciorek scfstaff 656 Jul 19 17:53 run_with_break.py~\\n-rw-r--r-- 1 paciorek scfstaff 385 Aug 29 13:46 stratified.py\\n-rw-r--r-- 1 paciorek scfstaff 586 Jul 19 17:18 stratified.py~\\n-rw-r--r-- 1 paciorek scfstaff 385 Jul 19 17:20 stratified_with_break.py~\\n-rw-r--r-- 1 paciorek scfstaff 33 Jul 19 17:05 test2.py\\n-rw-r--r-- 1 paciorek scfstaff 79 Sep 15 11:46 test3.py\\n-rw-r--r-- 1 paciorek scfstaff 25 Jul 19 17:04 test.py~\\n-rw-r--r-- 1 paciorek scfstaff 404 Aug 29 09:29 test.qmd\\n-rw-r--r-- 1 paciorek scfstaff 354 Aug 28 16:38 test.qmd~\\n-rw-r--r-- 1 paciorek scfstaff 66 Aug 29 13:46 test_scope.py\\n-rw-r--r-- 1 paciorek scfstaff 18 Sep 8 14:57 test.txt\\n-rw-r--r-- 1 paciorek scfstaff 10 Aug 31 08:01 tmp2.txt\\n-rw-r--r-- 1 paciorek scfstaff 2 Aug 25 13:29 tmp3.txt\\n-rw-r--r-- 1 paciorek scfstaff 57 Sep 8 07:53 tmp.qmd\\n-rw-r--r-- 1 paciorek scfstaff 55 Sep 8 07:52 tmp.qmd~\\n-rw-r--r-- 1 paciorek scfstaff 14 Aug 31 08:22 tmp.txt\\n-rw-r--r-- 1 paciorek scfstaff 4 Aug 31 08:01 tmp.txt~\\n-rw-r--r-- 1 paciorek scfstaff 9357 Jul 26 11:45 tree.png\\ndrwxr-sr-x 4 paciorek scfstaff 4 Jul 26 12:26 unit10-linalg_cache\\n-rw-r--r-- 1 paciorek scfstaff 81631 Aug 29 13:46 unit10-linalg.qmd\\n-rw-r--r-- 1 paciorek scfstaff 78863 Jul 5 15:41 unit10-linalg.qmd~\\n-rw-r--r-- 1 paciorek scfstaff 9509 Aug 29 13:46 unit1-intro.md\\n-rw-r--r-- 1 paciorek scfstaff 8908 Jul 26 07:47 unit1-intro.md~\\n-rw-r--r-- 1 paciorek scfstaff 56222 Aug 31 08:00 unit2-dataTech.qmd\\n-rw-r--r-- 1 paciorek scfstaff 52630 Jun 8 13:16 unit2-dataTech.qmd~\\n-rw-r--r-- 1 paciorek scfstaff 18297 Aug 31 09:10 unit3-bash.qmd\\n-rw-r--r-- 1 paciorek scfstaff 12674 Aug 26 12:07 unit3-bash.qmd~\\n-rw-r--r-- 1 paciorek scfstaff 12674 Jul 26 11:50 unit3-bash.Rmd~\\n-rw-r--r-- 1 paciorek scfstaff 3927 Aug 29 13:46 unit3-bash.sh\\n-rw-r--r-- 1 paciorek scfstaff 41222 Sep 6 18:05 unit4-goodPractices.qmd\\n-rw-r--r-- 1 paciorek scfstaff 16432 Jul 18 16:17 unit4-goodPractices.qmd~\\ndrwxr-sr-x 4 paciorek scfstaff 4 Jul 26 12:29 unit5-programming_cache\\ndrwxr-sr-x 4 paciorek scfstaff 4 Sep 20 09:36 unit5-programming_files\\n-rw-r--r-- 1 paciorek scfstaff 310435 Sep 20 09:36 unit5-programming.pdf\\n-rw-r--r-- 1 paciorek scfstaff 125073 Sep 20 09:34 unit5-programming.qmd\\n-rw-r--r-- 1 paciorek scfstaff 125664 Sep 20 09:36 unit5-programming.rmarkdown\\ndrwxr-sr-x 4 paciorek scfstaff 4 Jul 24 17:08 unit6-parallel_cache\\ndrwxr-sr-x 3 paciorek scfstaff 3 Sep 8 15:52 unit6-parallel_files\\n-rw-r--r-- 1 paciorek scfstaff 46622 Sep 8 15:46 unit6-parallel.qmd\\n-rw-r--r-- 1 paciorek scfstaff 45558 Jul 26 09:33 unit6-parallel.qmd~\\ndrwxr-sr-x 4 paciorek scfstaff 4 Jul 27 09:40 unit7-bigData_cache\\n-rw-r--r-- 1 paciorek scfstaff 50238 Aug 29 13:46 unit7-bigData.qmd\\n-rw-r--r-- 1 paciorek scfstaff 69916 Jul 26 
17:52 unit7-bigData.qmd~\\ndrwxr-sr-x 4 paciorek scfstaff 4 May 25 17:14 unit8-numbers_cache\\ndrwxr-sr-x 3 paciorek scfstaff 3 Jul 26 12:27 unit8-numbers_files\\n-rw-r--r-- 1 paciorek scfstaff 29633 Aug 29 13:46 unit8-numbers.qmd\\n-rw-r--r-- 1 paciorek scfstaff 29174 May 24 12:22 unit8-numbers.qmd~\\n-rw-r--r-- 1 paciorek scfstaff 42193 Aug 29 13:46 unit9-sim.qmd\\n-rw-r--r-- 1 paciorek scfstaff 42193 Jul 10 10:58 unit9-sim.qmd~\\n-rw-r--r-- 1 paciorek scfstaff 72 Aug 28 16:39 Untitled.ipynb\\n-rw-r--r-- 1 paciorek scfstaff 142 Aug 29 13:46 vec_orig.py\\n-rw-r--r-- 1 paciorek scfstaff 555 Sep 20 09:35 vec.pyc\\n'\n\nwith io.BytesIO(files.stdout) as stream: # create a file-like object\n content = stream.readlines()\ncontent[2:4]\n\n[b'drwxr-sr-x 13 paciorek scfstaff 38 Sep 20 09:19 ..\\n', b'-rw-r--r-- 1 paciorek scfstaff 117142 Sep 7 18:13 chatgpt-regex-numbers.png\\n']\n\n\nThere are also a bunch of functions that will do specific queries of the filesystem, including\n\nos.path.exists(\"unit2-dataTech.qmd\")\n\nTrue\n\nos.listdir(\"../data\")\n\n['coop.txt.gz', 'test.db', 'cpds.csv', 'IPs.RData', 'airline.csv', 'stackoverflow-2016.db', 'airline.parquet', 'hivSequ.csv', 'RTADataSub.csv', 'precip.txt', 'precipData.txt']\n\n\nThere are some tools for dealing with differences between operating systems. os.path.join is a nice example:\n\nos.listdir(os.path.join(\"..\", \"data\"))\n\n['coop.txt.gz', 'test.db', 'cpds.csv', 'IPs.RData', 'airline.csv', 'stackoverflow-2016.db', 'airline.parquet', 'hivSequ.csv', 'RTADataSub.csv', 'precip.txt', 'precipData.txt']\n\n\nIt’s best if you can to write your code, as shown here with os.path.join, in a way that is agnostic to the underlying operating system.\nTo get some info on the system you’re running on:\n\nimport platform\nplatform.system()\n\n'Linux'\n\nos.uname()\n\nposix.uname_result(sysname='Linux', nodename='smeagol', release='5.15.0-73-generic', version='#80-Ubuntu SMP Mon May 15 15:18:26 UTC 2023', machine='x86_64')\n\nplatform.python_version()\n\n'3.11.0'\n\n\nTo retrieve environment variables:\n\nos.environ['PATH']\n\n'/system/linux/mambaforge-3.11/bin:/usr/local/linux/mambaforge-3.11/condabin:/system/linux/mambaforge-3.11/condabin:/system/linux/mambaforge-3.11/bin:/system/linux/mambaforge-3.11/condabin:/system/linux/mambaforge-3.11/bin:/system/linux/julia-1.8.5/bin:/system/linux/mambaforge-3.11/bin:/accounts/vis/paciorek/bin:/system/linux/bin:/usr/local/bin:/usr/bin:/usr/sbin:/usr/lib/rstudio-server/bin:/accounts/vis/paciorek/.local/bin'\n\n\nYou can have an Python script act as a shell script (like running a bash shell script) as follows.\n\nWrite your Python code in a text file, say example.py\nAs the first line of the file, include #!/usr/bin/python (like #!/bin/bash in a bash shell file, as seen in Unit 2) or for more portability across machines, include #!/usr/bin/env python.\nMake the Python code file executable with chmod: chmod ugo+x example.py.\nRun the script from the command line: ./example.py\n\nIf you want to pass arguments into your script, you can do so with the argparse package.\n\nimport argparse\nparser = argparse.ArgumentParser()\nparser.add_argument('-y', '--year', default=2002,\n help='year to download')\nparser.add_argument('-m', '--month', default=None,\n help='month to download')\nargs = parse.parse_args()\nargs.year\nyear = int(args.year)\n\nNow we can run it as follows in the shell:\n\n./example.py 2004 January\n\nUse Ctrl-C to interrupt execution. 
This will generally back out gracefully, returning you to a state as if the command had not been started. Note that if Python is exceeding the amount of memory available, there can be a long delay. This can be frustrating, particularly since a primary reason you would want to interrupt is when Python runs out of memory." + "text": "Interacting with the operating system\nScripting languages allow one to interact with the operating system in various ways. Most allow you to call out to the shell to run arbitrary shell code and save results within your session.\nI’ll assume everyone knows about the following functions/functionality for interacting with the filesystem and file in Python: os.getcwd, os.chdir, import, pickle.dump, pickle.load\nAlso in IPython there is additional functionality/syntax.\nHere are a variety of tools for interacting with the operating system:\n\nTo run UNIX commands from within Python, use subprocess.run(), as follows, noting that we can save the result of a system call to an R object:\n\nimport subprocess, io\nsubprocess.run([\"ls\", \"-al\"]) ## results apparently not shown when compiled...\n\nCompletedProcess(args=['ls', '-al'], returncode=0)\n\nfiles = subprocess.run([\"ls\", \"-al\"], capture_output = True)\nfiles.stdout\n\nb'total 3168\\ndrwxr-sr-x 13 paciorek scfstaff 79 Sep 22 12:10 .\\ndrwxr-sr-x 13 paciorek scfstaff 38 Sep 22 11:45 ..\\n-rw-r--r-- 1 paciorek scfstaff 117142 Sep 7 18:13 chatgpt-regex-numbers.png\\n-rw-r--r-- 1 paciorek scfstaff 176 Aug 29 13:46 debug_code.py\\n-rw-r--r-- 1 paciorek scfstaff 216 Jul 20 08:08 debug_code.py~\\n-rw-r--r-- 1 paciorek scfstaff 42 Aug 29 13:46 dummyfun.py\\n-rw-r--r-- 1 paciorek scfstaff 175396 Aug 29 13:46 exampleGraphic.png\\n-rw-r--r-- 1 paciorek scfstaff 1036183 Aug 29 13:46 file.txt\\n-rw-r--r-- 1 paciorek scfstaff 59 Aug 29 13:46 foo.py\\n-rw-r--r-- 1 paciorek scfstaff 20260 Jul 26 11:45 graph.png\\n-rw-r--r-- 1 paciorek scfstaff 200 Aug 24 08:27 html.tips\\n-rw-r--r-- 1 paciorek scfstaff 45 Aug 24 07:54 html.tips~\\ndrwxr-sr-x 2 paciorek scfstaff 3 Aug 28 16:39 .ipynb_checkpoints\\n-rw-r--r-- 1 paciorek scfstaff 2464 Jul 26 11:45 linked-list.png\\n-rw-r--r-- 1 paciorek scfstaff 98 Sep 21 11:37 local.py\\n-rw-r--r-- 1 paciorek scfstaff 79 Aug 29 13:46 mymod.py\\ndrwxr-sr-x 4 paciorek scfstaff 8 Sep 15 17:26 mypkg\\n-rw-r--r-- 1 paciorek scfstaff 401 Aug 29 13:46 mysqrt.py\\n-rw-r--r-- 1 paciorek scfstaff 63998 Aug 18 11:19 normalized_example.png\\ndrwxr-sr-x 2 paciorek scfstaff 13 Sep 22 12:10 __pycache__\\n-rw-r--r-- 1 paciorek scfstaff 223 Jul 9 14:10 q.py\\n-rw-r--r-- 1 paciorek scfstaff 590 Jul 19 18:19 run_no_break2.py\\n-rw-r--r-- 1 paciorek scfstaff 588 Jul 19 18:14 run_no_break2.py~\\n-rw-r--r-- 1 paciorek scfstaff 381 Jul 19 17:50 run_no_break_full.py~\\n-rw-r--r-- 1 paciorek scfstaff 573 Aug 29 13:46 run_no_break.py\\n-rw-r--r-- 1 paciorek scfstaff 656 Jul 19 17:55 run_no_break.py~\\n-rw-r--r-- 1 paciorek scfstaff 591 Aug 29 13:46 run_with_break.py\\n-rw-r--r-- 1 paciorek scfstaff 656 Jul 19 17:53 run_with_break.py~\\n-rw-r--r-- 1 paciorek scfstaff 385 Aug 29 13:46 stratified.py\\n-rw-r--r-- 1 paciorek scfstaff 586 Jul 19 17:18 stratified.py~\\n-rw-r--r-- 1 paciorek scfstaff 385 Jul 19 17:20 stratified_with_break.py~\\n-rw-r--r-- 1 paciorek scfstaff 33 Jul 19 17:05 test2.py\\n-rw-r--r-- 1 paciorek scfstaff 79 Sep 15 11:46 test3.py\\n-rw-r--r-- 1 paciorek scfstaff 25 Jul 19 17:04 test.py~\\n-rw-r--r-- 1 paciorek scfstaff 404 Aug 29 09:29 test.qmd\\n-rw-r--r-- 1 paciorek scfstaff 354 Aug 28 16:38 
test.qmd~\\n-rw-r--r-- 1 paciorek scfstaff 66 Aug 29 13:46 test_scope.py\\n-rw-r--r-- 1 paciorek scfstaff 18 Sep 8 14:57 test.txt\\n-rw-r--r-- 1 paciorek scfstaff 10 Aug 31 08:01 tmp2.txt\\n-rw-r--r-- 1 paciorek scfstaff 2 Aug 25 13:29 tmp3.txt\\n-rw-r--r-- 1 paciorek scfstaff 57 Sep 8 07:53 tmp.qmd\\n-rw-r--r-- 1 paciorek scfstaff 55 Sep 8 07:52 tmp.qmd~\\n-rw-r--r-- 1 paciorek scfstaff 14 Aug 31 08:22 tmp.txt\\n-rw-r--r-- 1 paciorek scfstaff 4 Aug 31 08:01 tmp.txt~\\n-rw-r--r-- 1 paciorek scfstaff 9357 Jul 26 11:45 tree.png\\ndrwxr-sr-x 4 paciorek scfstaff 4 Jul 26 12:26 unit10-linalg_cache\\n-rw-r--r-- 1 paciorek scfstaff 81631 Aug 29 13:46 unit10-linalg.qmd\\n-rw-r--r-- 1 paciorek scfstaff 78863 Jul 5 15:41 unit10-linalg.qmd~\\n-rw-r--r-- 1 paciorek scfstaff 9509 Aug 29 13:46 unit1-intro.md\\n-rw-r--r-- 1 paciorek scfstaff 8908 Jul 26 07:47 unit1-intro.md~\\n-rw-r--r-- 1 paciorek scfstaff 56222 Aug 31 08:00 unit2-dataTech.qmd\\n-rw-r--r-- 1 paciorek scfstaff 52630 Jun 8 13:16 unit2-dataTech.qmd~\\n-rw-r--r-- 1 paciorek scfstaff 18297 Aug 31 09:10 unit3-bash.qmd\\n-rw-r--r-- 1 paciorek scfstaff 12674 Aug 26 12:07 unit3-bash.qmd~\\n-rw-r--r-- 1 paciorek scfstaff 12674 Jul 26 11:50 unit3-bash.Rmd~\\n-rw-r--r-- 1 paciorek scfstaff 3927 Aug 29 13:46 unit3-bash.sh\\n-rw-r--r-- 1 paciorek scfstaff 41222 Sep 6 18:05 unit4-goodPractices.qmd\\n-rw-r--r-- 1 paciorek scfstaff 16432 Jul 18 16:17 unit4-goodPractices.qmd~\\ndrwxr-sr-x 4 paciorek scfstaff 4 Jul 26 12:29 unit5-programming_cache\\ndrwxr-sr-x 4 paciorek scfstaff 4 Sep 22 12:10 unit5-programming_files\\n-rw-r--r-- 1 paciorek scfstaff 310667 Sep 22 12:10 unit5-programming.pdf\\n-rw-r--r-- 1 paciorek scfstaff 125504 Sep 22 12:09 unit5-programming.qmd\\n-rw-r--r-- 1 paciorek scfstaff 126099 Sep 22 12:10 unit5-programming.rmarkdown\\ndrwxr-sr-x 4 paciorek scfstaff 4 Jul 24 17:08 unit6-parallel_cache\\ndrwxr-sr-x 3 paciorek scfstaff 3 Sep 8 15:52 unit6-parallel_files\\n-rw-r--r-- 1 paciorek scfstaff 46622 Sep 8 15:46 unit6-parallel.qmd\\n-rw-r--r-- 1 paciorek scfstaff 45558 Jul 26 09:33 unit6-parallel.qmd~\\ndrwxr-sr-x 4 paciorek scfstaff 4 Jul 27 09:40 unit7-bigData_cache\\n-rw-r--r-- 1 paciorek scfstaff 50238 Aug 29 13:46 unit7-bigData.qmd\\n-rw-r--r-- 1 paciorek scfstaff 69916 Jul 26 17:52 unit7-bigData.qmd~\\ndrwxr-sr-x 4 paciorek scfstaff 4 May 25 17:14 unit8-numbers_cache\\ndrwxr-sr-x 3 paciorek scfstaff 3 Jul 26 12:27 unit8-numbers_files\\n-rw-r--r-- 1 paciorek scfstaff 29633 Aug 29 13:46 unit8-numbers.qmd\\n-rw-r--r-- 1 paciorek scfstaff 29174 May 24 12:22 unit8-numbers.qmd~\\n-rw-r--r-- 1 paciorek scfstaff 42193 Aug 29 13:46 unit9-sim.qmd\\n-rw-r--r-- 1 paciorek scfstaff 42193 Jul 10 10:58 unit9-sim.qmd~\\n-rw-r--r-- 1 paciorek scfstaff 72 Aug 28 16:39 Untitled.ipynb\\n-rw-r--r-- 1 paciorek scfstaff 142 Aug 29 13:46 vec_orig.py\\n-rw-r--r-- 1 paciorek scfstaff 555 Sep 22 12:10 vec.pyc\\n'\n\nwith io.BytesIO(files.stdout) as stream: # create a file-like object\n content = stream.readlines()\ncontent[2:4]\n\n[b'drwxr-sr-x 13 paciorek scfstaff 38 Sep 22 11:45 ..\\n', b'-rw-r--r-- 1 paciorek scfstaff 117142 Sep 7 18:13 chatgpt-regex-numbers.png\\n']\n\n\nThere are also a bunch of functions that will do specific queries of the filesystem, including\n\nos.path.exists(\"unit2-dataTech.qmd\")\n\nTrue\n\nos.listdir(\"../data\")\n\n['coop.txt.gz', 'test.db', 'cpds.csv', 'IPs.RData', 'airline.csv', 'stackoverflow-2016.db', 'airline.parquet', 'hivSequ.csv', 'RTADataSub.csv', 'precip.txt', 'precipData.txt']\n\n\nThere are some tools for dealing 
with differences between operating systems. os.path.join is a nice example:\n\nos.listdir(os.path.join(\"..\", \"data\"))\n\n['coop.txt.gz', 'test.db', 'cpds.csv', 'IPs.RData', 'airline.csv', 'stackoverflow-2016.db', 'airline.parquet', 'hivSequ.csv', 'RTADataSub.csv', 'precip.txt', 'precipData.txt']\n\n\nIt’s best if you can to write your code, as shown here with os.path.join, in a way that is agnostic to the underlying operating system.\nTo get some info on the system you’re running on:\n\nimport platform\nplatform.system()\n\n'Linux'\n\nos.uname()\n\nposix.uname_result(sysname='Linux', nodename='smeagol', release='5.15.0-73-generic', version='#80-Ubuntu SMP Mon May 15 15:18:26 UTC 2023', machine='x86_64')\n\nplatform.python_version()\n\n'3.11.0'\n\n\nTo retrieve environment variables:\n\nos.environ['PATH']\n\n'/system/linux/mambaforge-3.11/bin:/usr/local/linux/mambaforge-3.11/condabin:/system/linux/mambaforge-3.11/condabin:/system/linux/mambaforge-3.11/bin:/system/linux/mambaforge-3.11/condabin:/system/linux/mambaforge-3.11/bin:/system/linux/julia-1.8.5/bin:/system/linux/mambaforge-3.11/bin:/accounts/vis/paciorek/bin:/system/linux/bin:/usr/local/bin:/usr/bin:/usr/sbin:/usr/lib/rstudio-server/bin:/accounts/vis/paciorek/.local/bin'\n\n\nYou can have an Python script act as a shell script (like running a bash shell script) as follows.\n\nWrite your Python code in a text file, say example.py\nAs the first line of the file, include #!/usr/bin/python (like #!/bin/bash in a bash shell file, as seen in Unit 2) or for more portability across machines, include #!/usr/bin/env python.\nMake the Python code file executable with chmod: chmod ugo+x example.py.\nRun the script from the command line: ./example.py\n\nIf you want to pass arguments into your script, you can do so with the argparse package.\n\nimport argparse\nparser = argparse.ArgumentParser()\nparser.add_argument('-y', '--year', default=2002,\n help='year to download')\nparser.add_argument('-m', '--month', default=None,\n help='month to download')\nargs = parse.parse_args()\nargs.year\nyear = int(args.year)\n\nNow we can run it as follows in the shell:\n\n./example.py 2004 January\n\nUse Ctrl-C to interrupt execution. This will generally back out gracefully, returning you to a state as if the command had not been started. Note that if Python is exceeding the amount of memory available, there can be a long delay. This can be frustrating, particularly since a primary reason you would want to interrupt is when Python runs out of memory." }, { "objectID": "units/unit5-programming.html#interacting-with-external-code", @@ -501,7 +501,7 @@ "href": "units/unit5-programming.html#types-and-classes", "title": "Programming concepts", "section": "Types and classes", - "text": "Types and classes\n\nOverview and static vs. dynamic typing\nThe term ‘type’ refers to how a given piece of information is stored and what operations can be done with the information.\n‘Primitive’ types are the most basic types that often relate directly to how data are stored in memory or on disk (e.g., boolean, integer, numeric (real-valued, aka double or floating point), character, pointer (aka address, reference).\nIn compiled languages like C and C++, one has to define the type of each variable. Such languages are statically typed. Interpreted (or scripting) languages such as Python and R have dynamic types. 
One can associate different types of information with a given variable name at different times and without declaring the type of the variable:\n\nx = 'hello'\nprint(x)\n\nhello\n\nx = 7\nx*3\n\n21\n\n\nIn contrast in a language like C, one has to declare a variable based on its type before using it:\n\ndouble y;\ndouble x = 3.1;\ny = x * 7.1;\n\nDynamic typing can be quite helpful from the perspective of quick implementation and avoiding tedious type definitions and problems from minor inconsistencies between types (e.g., multiplying an integer by a real-valued number). But static typing has some critical advantages from the perspective of software development, including:\n\nprotecting against errors from mismatched values and unexpected user inputs, and\ngenerally much faster execution because the type of a variable does not need to be checked when the code is run.\n\nMore complex types in Python (and in R) often use references (pointers, aka addresses) to the actual locations of the data. We’ll see this in detail when we discuss Memory.\n\n\nTypes in Python\nYou should be familiar with the important built-in data types in Python, most importantly lists, tuples, and dictionaries, as well as basic scalar types such as integers, floats, and strings.\nLet’s look at the type of various built-in data structures in Python and in numpy, which provides important types for numerical computing.\n\nx = 3\ntype(x)\n\n<class 'int'>\n\nx = 3.0\ntype(x)\n\n<class 'float'>\n\nx = 'abc'\ntype(x)\n\n<class 'str'>\n\nx = False\ntype(x)\n\n<class 'bool'>\n\nx = [3, 3.0, 'abc']\ntype(x)\n\n<class 'list'>\n\nimport numpy as np\n\nx = np.array([3, 5, 7]) ## array of integers\ntype(x)\n\n<class 'numpy.ndarray'>\n\ntype(x[0])\n\n<class 'numpy.int64'>\n\nx = np.random.normal(size = 3) # array of floats (aka 'doubles')\ntype(x[0])\n\n<class 'numpy.float64'>\n\nx = np.random.normal(size = (3,4)) # multi-dimensional array\ntype(x)\n\n<class 'numpy.ndarray'>\n\n\nSometimes numpy may modify a type to make things easier for you, which often works well, but you may want to control it yourself to be sure:\n\nx = np.array([3, 5, 7.3])\nx\n\narray([3. , 5. , 7.3])\n\ntype(x[0])\n\n<class 'numpy.float64'>\n\nx = np.array([3.0, 5.0, 7.0]) # Force use of floats (either `3.0` or `3.`).\ntype(x[0])\n\n<class 'numpy.float64'>\n\nx = np.array([3, 5, 7], dtype = 'float64')\ntype(x[0])\n\n<class 'numpy.float64'>\n\n\nThis can come up when working on a GPU, where the default is usually 32-bit (4-byte) numbers instead of 64-bit (8-byte) numbers. ### Composite objects\nMany objects can be composite (e.g., a list of dictionaries or a dictionary of lists, tuples, and strings).\n\nmydict = {'a': 3, 'b': 7}\nmylist = [3, 5, 7]\n\nmylist[1] = mydict\nmylist\n\n[3, {'a': 3, 'b': 7}, 7]\n\nmydict['a'] = mylist\n\n\n\nMutable objects\nMost objects in Python can be modified in place (i.e., modifying only some of the object), but tuples, strings, and sets are immutable:\n\nx = (3,5,7)\ntry:\n x[1] = 4\nexcept Exception as error:\n print(error)\n\n'tuple' object does not support item assignment\n\ns = 'abc'\ns[1]\n\n'b'\n\ntry:\n s[1] = 'y'\nexcept Exception as error:\n print(error)\n\n'str' object does not support item assignment\n\n\n\n\nConverting between types\nThis also goes by the term coercion and casting. 
Casting often needs to be done explicitly in compiled languages and somewhat less so in interpreted languages like Python.\nWe can cast (coerce) between different basic types:\n\ny = str(x[0])\ny\n\n'3'\n\ny = int(x[0])\ntype(y)\n\n<class 'int'>\n\n\nSome common conversions are converting numbers that are being interpreted as strings into actual numbers and converting between booleans and numeric values.\nIn some cases Python will automatically do conversions behind the scenes in a smart way (or occasionally not so smart way). Consider these attempts/examples of implicit coercion:\n\nx = np.array([False, True, True])\nx.sum() # What do you think is going to happen?\n\n2\n\nx = np.random.normal(size = 5)\ntry:\n x[3] = 'hat' # What do you think is going to happen?\nexcept Exception as error:\n print(error)\n \n\ncould not convert string to float: 'hat'\n\nmyArray = [1, 3, 5, 9, 4, 7]\n# myArray[2.0] # What do you think is going to happen?\n# myArray[2.73] # What do you think is going to happen?\n\nR is less strict and will do conversions in some cases that Python won’t:\n\nx <- rnorm(5)\nx[2.0]\n\n[1] 1.50022\n\nx[2.73]\n\n[1] 1.50022\n\n\nWhat are the advantages and disadvantages of the different behaviors of Python and R?\n\n\nDataframes\nHopefully you’re also familiar with the Pandas dataframe type.\nPandas picked up the idea of dataframes from R and functionality is similar in many ways to what you can do with R’s dplyr package.\ndplyr and pandas provide a lot of functionality for the “split-apply-combine” framework of working with “rectangular” data.\nOften analyses are done in a stratified fashion - the same operation or analysis is done on subsets of the data set. The subsets might be different time points, different locations, different hospitals, different people, etc.\nThe split-apply-combine framework is intended to operate in this kind of context: - first one splits the dataset by one or more variables, - then one does something to each subset, and - then one combines the results.\nsplit-apply-combine is also closely related to the famous Map-Reduce framework underlying big data tools such as Hadoop and Spark.\nIt’s also very similar to standard SQL queries involving filtering, grouping, and aggregation.\n\n\nPython object protocols\nThere are a number of broad categories of kinds of objects: mapping, number, sequence, iterator. These are called object protocols.\nAll objects that fall in a given category share key characteristics. For example sequence objects have a notion of “next”, while iterator objects have a notion of “stopping”.\nIf you implement your own class that falls into one of these categories, it should follow the relevant protocol by providing the required methods. For example a container class that supports iteration should provide the __iter__ and __next__ methods.\nHere we see that tuples are iterable containers:\n\nmytuple = (\"apple\", \"banana\", \"cherry\")\n\nfor item in mytuple:\n print(item)\n\napple\nbanana\ncherry\n\nmyit = iter(mytuple)\n\nprint(next(myit))\n\napple\n\nprint(next(myit))\n\nbanana\n\nmyit.__next__()\n\n'cherry'\n\nx = zip(['clinton', 'bush', 'obama', 'trump'], ['Dem', 'Rep', 'Dem', 'Rep'])\nnext(x)\n\n('clinton', 'Dem')\n\nnext(x)\n\n('bush', 'Rep')\n\n\nWe can also go from an iterable object to a standard list:\n\nr = range(5)\nr\n\nrange(0, 5)\n\nlist(r)\n\n[0, 1, 2, 3, 4]" + "text": "Types and classes\n\nOverview and static vs. 
dynamic typing\nThe term ‘type’ refers to how a given piece of information is stored and what operations can be done with the information.\n‘Primitive’ types are the most basic types that often relate directly to how data are stored in memory or on disk (e.g., boolean, integer, numeric (real-valued, aka double or floating point), character, pointer (aka address, reference).\nIn compiled languages like C and C++, one has to define the type of each variable. Such languages are statically typed. Interpreted (or scripting) languages such as Python and R have dynamic types. One can associate different types of information with a given variable name at different times and without declaring the type of the variable:\n\nx = 'hello'\nprint(x)\n\nhello\n\nx = 7\nx*3\n\n21\n\n\nIn contrast in a language like C, one has to declare a variable based on its type before using it:\n\ndouble y;\ndouble x = 3.1;\ny = x * 7.1;\n\nDynamic typing can be quite helpful from the perspective of quick implementation and avoiding tedious type definitions and problems from minor inconsistencies between types (e.g., multiplying an integer by a real-valued number). But static typing has some critical advantages from the perspective of software development, including:\n\nprotecting against errors from mismatched values and unexpected user inputs, and\ngenerally much faster execution because the type of a variable does not need to be checked when the code is run.\n\nMore complex types in Python (and in R) often use references (pointers, aka addresses) to the actual locations of the data. We’ll see this in detail when we discuss Memory.\n\n\nTypes in Python\nYou should be familiar with the important built-in data types in Python, most importantly lists, tuples, and dictionaries, as well as basic scalar types such as integers, floats, and strings.\nLet’s look at the type of various built-in data structures in Python and in numpy, which provides important types for numerical computing.\n\nx = 3\ntype(x)\n\n<class 'int'>\n\nx = 3.0\ntype(x)\n\n<class 'float'>\n\nx = 'abc'\ntype(x)\n\n<class 'str'>\n\nx = False\ntype(x)\n\n<class 'bool'>\n\nx = [3, 3.0, 'abc']\ntype(x)\n\n<class 'list'>\n\nimport numpy as np\n\nx = np.array([3, 5, 7]) ## array of integers\ntype(x)\n\n<class 'numpy.ndarray'>\n\ntype(x[0])\n\n<class 'numpy.int64'>\n\nx = np.random.normal(size = 3) # array of floats (aka 'doubles')\ntype(x[0])\n\n<class 'numpy.float64'>\n\nx = np.random.normal(size = (3,4)) # multi-dimensional array\ntype(x)\n\n<class 'numpy.ndarray'>\n\n\nSometimes numpy may modify a type to make things easier for you, which often works well, but you may want to control it yourself to be sure:\n\nx = np.array([3, 5, 7.3])\nx\n\narray([3. , 5. , 7.3])\n\ntype(x[0])\n\n<class 'numpy.float64'>\n\nx = np.array([3.0, 5.0, 7.0]) # Force use of floats (either `3.0` or `3.`).\ntype(x[0])\n\n<class 'numpy.float64'>\n\nx = np.array([3, 5, 7], dtype = 'float64')\ntype(x[0])\n\n<class 'numpy.float64'>\n\n\nThis can come up when working on a GPU, where the default is usually 32-bit (4-byte) numbers instead of 64-bit (8-byte) numbers. 
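A minimal sketch (an addition to these notes, assuming only the standard numpy API; variable names are illustrative) of checking and controlling precision explicitly, e.g., when 4-byte floats are wanted for GPU work:

import numpy as np

x64 = np.array([3, 5, 7], dtype='float64')  # numpy's usual float precision (8 bytes per element)
x32 = x64.astype('float32')                 # explicitly downcast to 4-byte floats
print(x64.itemsize, x32.itemsize)           # 8 4
print(x32.dtype)                            # float32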
### Composite objects\nMany objects can be composite (e.g., a list of dictionaries or a dictionary of lists, tuples, and strings).\n\nmydict = {'a': 3, 'b': 7}\nmylist = [3, 5, 7]\n\nmylist[1] = mydict\nmylist\n\n[3, {'a': 3, 'b': 7}, 7]\n\nmydict['a'] = mylist\n\n\n\nMutable objects\nMost objects in Python can be modified in place (i.e., modifying only some of the object), but tuples, strings, and sets are immutable:\n\nx = (3,5,7)\ntry:\n x[1] = 4\nexcept Exception as error:\n print(error)\n\n'tuple' object does not support item assignment\n\ns = 'abc'\ns[1]\n\n'b'\n\ntry:\n s[1] = 'y'\nexcept Exception as error:\n print(error)\n\n'str' object does not support item assignment\n\n\n\n\nConverting between types\nThis also goes by the term coercion and casting. Casting often needs to be done explicitly in compiled languages and somewhat less so in interpreted languages like Python.\nWe can cast (coerce) between different basic types:\n\ny = str(x[0])\ny\n\n'3'\n\ny = int(x[0])\ntype(y)\n\n<class 'int'>\n\n\nSome common conversions are converting numbers that are being interpreted as strings into actual numbers and converting between booleans and numeric values.\nIn some cases Python will automatically do conversions behind the scenes in a smart way (or occasionally not so smart way). Consider these attempts/examples of implicit coercion:\n\nx = np.array([False, True, True])\nx.sum() # What do you think is going to happen?\n\n2\n\nx = np.random.normal(size = 5)\ntry:\n x[3] = 'hat' # What do you think is going to happen?\nexcept Exception as error:\n print(error)\n \n\ncould not convert string to float: 'hat'\n\nmyArray = [1, 3, 5, 9, 4, 7]\n# myArray[2.0] # What do you think is going to happen?\n# myArray[2.73] # What do you think is going to happen?\n\nR is less strict and will do conversions in some cases that Python won’t:\n\nx <- rnorm(5)\nx[2.0]\n\n[1] 0.1552199\n\nx[2.73]\n\n[1] 0.1552199\n\n\nWhat are the advantages and disadvantages of the different behaviors of Python and R?\n\n\nDataframes\nHopefully you’re also familiar with the Pandas dataframe type.\nPandas picked up the idea of dataframes from R and functionality is similar in many ways to what you can do with R’s dplyr package.\ndplyr and pandas provide a lot of functionality for the “split-apply-combine” framework of working with “rectangular” data.\nOften analyses are done in a stratified fashion - the same operation or analysis is done on subsets of the data set. The subsets might be different time points, different locations, different hospitals, different people, etc.\nThe split-apply-combine framework is intended to operate in this kind of context: - first one splits the dataset by one or more variables, - then one does something to each subset, and - then one combines the results.\nsplit-apply-combine is also closely related to the famous Map-Reduce framework underlying big data tools such as Hadoop and Spark.\nIt’s also very similar to standard SQL queries involving filtering, grouping, and aggregation.\n\n\nPython object protocols\nThere are a number of broad categories of kinds of objects: mapping, number, sequence, iterator. These are called object protocols.\nAll objects that fall in a given category share key characteristics. For example sequence objects have a notion of “next”, while iterator objects have a notion of “stopping”.\nIf you implement your own class that falls into one of these categories, it should follow the relevant protocol by providing the required methods. 
For example a container class that supports iteration should provide the __iter__ and __next__ methods.\nHere we see that tuples are iterable containers:\n\nmytuple = (\"apple\", \"banana\", \"cherry\")\n\nfor item in mytuple:\n print(item)\n\napple\nbanana\ncherry\n\nmyit = iter(mytuple)\n\nprint(next(myit))\n\napple\n\nprint(next(myit))\n\nbanana\n\nmyit.__next__()\n\n'cherry'\n\nx = zip(['clinton', 'bush', 'obama', 'trump'], ['Dem', 'Rep', 'Dem', 'Rep'])\nnext(x)\n\n('clinton', 'Dem')\n\nnext(x)\n\n('bush', 'Rep')\n\n\nWe can also go from an iterable object to a standard list:\n\nr = range(5)\nr\n\nrange(0, 5)\n\nlist(r)\n\n[0, 1, 2, 3, 4]" }, { "objectID": "units/unit5-programming.html#principles", @@ -529,7 +529,7 @@ "href": "units/unit5-programming.html#generic-function-oop", "title": "Programming concepts", "section": "Generic function OOP", - "text": "Generic function OOP\nLet’s consider the len function in Python. It seems to work magically on various kinds of objects.\n\n\nx = [3, 5, 7]\nlen(x)\n\n3\n\nx = np.random.normal(size = 5)\nlen(x)\n\n5\n\nx = {'a': 2, 'b': 3}\nlen(x)\n\n2\n\n\nSuppose you were writing the len function. What would you have to do to make it work as it did above? What would happen if a user wants to use len with a class that they define?\nInstead, Python implements the len function by calling the __len__ method of the class that the argument belongs to.\n\nx = {'a': 2, 'b': 3}\nlen(x)\n\n2\n\nx.__len__()\n\n2\n\n\n__len__ is a dunder method (a “Double-UNDERscore” method), which we’ll discuss more in a bit.\nSomething similar occurs with operators:\n\nx = 3\nx + 5\n\n8\n\nx = 'abc'\nx + 'xyz'\n\n'abcxyz'\n\nx.__add__('xyz')\n\n'abcxyz'\n\n\nThis use of generic functions is convenient in that it allows us to work with a variety of kinds of objects using familiar functions.\nThe use of such generic functions and operators is similar in spirit to function or method overloading in C++ and Java. It is also how the (very) old S3 system in R works. And it’s a key part of the (fairly) new Julia language.\n\nWhy use generic functions?\nThe Python developers could have written len as a regular function with a bunch of if statements so that it can handle different kinds of input objects.\nThis has some disadvantages:\n\nWe need to write the code that does the checking.\nFurthermore, all the code for the different cases all lives inside one potentially very long function, unless we create class-specific helper functions.\nMost importantly, len will only work for existing classes. And users can’t easily extend it for new classes that they create because they don’t control the len (built-in) function. So a user could not add the additional conditions/classes in a big if-else statement. 
The generic function approach makes the system extensible – we can build our own new functionality on top of what is already in Python.\n\n\nThe print function\nLike len, print is a generic function, with various class-specific methods.\nWe can write a print method for our own class by defining the __str__ method as well as a __repr__ method giving what to display when the name of an object is typed.\n\nclass Bear:\n def __init__(self, name, age):\n self.name = name\n self.age = age\n\nyog = Bear(\"Yogi the Bear\", 23)\nprint(yog)\n\n<__main__.Bear object at 0x7f9cc6ce4b10>\n\nclass Bear:\n def __init__(self, name, age):\n self.name = name\n self.age = age\n def __str__(self):\n return f\"A bear named {self.name} of age {self.age}.\"\n def __repr__(self):\n return f\"Bear(name={self.name}, age={self.age})\"\n\nyog = Bear(\"Yogi the Bear\", 23)\nprint(yog) # Invokes __str__\n\nA bear named Yogi the Bear of age 23.\n\nyog # Invokes __repr__\n\nBear(name=Yogi the Bear, age=23)\n\n\n\n\n\nMultiple dispatch OOP\nThe dispatch system involved in len and + involves only the first argument to the function (or operator). In contrast, Julia emphasizes the importance of multiple dispatch as particularly important for mathematical computation. With multiple dispatch, the specific method can be chosen based on more than one argument.\nIn R, the old (but still used in some contexts) S4 system in R and the new R7 system both provide for multiple dispatch.\nAs a very simple example unrelated to any specific language, multiple dispatch would allow one to do the following with the addition operator:\n3 + 7 # 10\n3 + 'a' # '3a'\n'hi' + ' there' # 'hi there'\nThe idea of having the behavior of an operator or function adapt to the type of the input(s) is one aspect of polymorphism." + "text": "Generic function OOP\nLet’s consider the len function in Python. It seems to work magically on various kinds of objects.\n\n\nx = [3, 5, 7]\nlen(x)\n\n3\n\nx = np.random.normal(size = 5)\nlen(x)\n\n5\n\nx = {'a': 2, 'b': 3}\nlen(x)\n\n2\n\n\nSuppose you were writing the len function. What would you have to do to make it work as it did above? What would happen if a user wants to use len with a class that they define?\nInstead, Python implements the len function by calling the __len__ method of the class that the argument belongs to.\n\nx = {'a': 2, 'b': 3}\nlen(x)\n\n2\n\nx.__len__()\n\n2\n\n\n__len__ is a dunder method (a “Double-UNDERscore” method), which we’ll discuss more in a bit.\nSomething similar occurs with operators:\n\nx = 3\nx + 5\n\n8\n\nx = 'abc'\nx + 'xyz'\n\n'abcxyz'\n\nx.__add__('xyz')\n\n'abcxyz'\n\n\nThis use of generic functions is convenient in that it allows us to work with a variety of kinds of objects using familiar functions.\nThe use of such generic functions and operators is similar in spirit to function or method overloading in C++ and Java. It is also how the (very) old S3 system in R works. And it’s a key part of the (fairly) new Julia language.\n\nWhy use generic functions?\nThe Python developers could have written len as a regular function with a bunch of if statements so that it can handle different kinds of input objects.\nThis has some disadvantages:\n\nWe need to write the code that does the checking.\nFurthermore, all the code for the different cases all lives inside one potentially very long function, unless we create class-specific helper functions.\nMost importantly, len will only work for existing classes. 
And users can’t easily extend it for new classes that they create because they don’t control the len (built-in) function. So a user could not add the additional conditions/classes in a big if-else statement. The generic function approach makes the system extensible – we can build our own new functionality on top of what is already in Python.\n\n\nThe print function\nLike len, print is a generic function, with various class-specific methods.\nWe can write a print method for our own class by defining the __str__ method as well as a __repr__ method giving what to display when the name of an object is typed.\n\nclass Bear:\n def __init__(self, name, age):\n self.name = name\n self.age = age\n\nyog = Bear(\"Yogi the Bear\", 23)\nprint(yog)\n\n<__main__.Bear object at 0x7fbec7703090>\n\nclass Bear:\n def __init__(self, name, age):\n self.name = name\n self.age = age\n def __str__(self):\n return f\"A bear named {self.name} of age {self.age}.\"\n def __repr__(self):\n return f\"Bear(name={self.name}, age={self.age})\"\n\nyog = Bear(\"Yogi the Bear\", 23)\nprint(yog) # Invokes __str__\n\nA bear named Yogi the Bear of age 23.\n\nyog # Invokes __repr__\n\nBear(name=Yogi the Bear, age=23)\n\n\n\n\n\nMultiple dispatch OOP\nThe dispatch system involved in len and + involves only the first argument to the function (or operator). In contrast, Julia emphasizes the importance of multiple dispatch as particularly important for mathematical computation. With multiple dispatch, the specific method can be chosen based on more than one argument.\nIn R, the old (but still used in some contexts) S4 system in R and the new R7 system both provide for multiple dispatch.\nAs a very simple example unrelated to any specific language, multiple dispatch would allow one to do the following with the addition operator:\n3 + 7 # 10\n3 + 'a' # '3a'\n'hi' + ' there' # 'hi there'\nThe idea of having the behavior of an operator or function adapt to the type of the input(s) is one aspect of polymorphism." }, { "objectID": "units/unit5-programming.html#the-python-object-model-and-dunder-methods.", @@ -564,14 +564,21 @@ "href": "units/unit5-programming.html#pass-by-value-vs.-pass-by-reference", "title": "Programming concepts", "section": "Pass by value vs. pass by reference", - "text": "Pass by value vs. pass by reference\nWhen talking about programming languages, one often distinguishes pass-by-value and pass-by-reference.\nPass-by-value means that when a function is called with one or more arguments, a copy is made of each argument and the function operates on those copies. In pass-by-value, changes to an argument made within a function do not affect the value of the argument in the calling environment.\nPass-by-reference means that the arguments are not copied, but rather that information is passed allowing the function to find and modify the original value of the objects passed into the function. In pass-by-reference changes inside a function can affect the object outside of the function.\nPass-by-value is elegant and modular in that functions do not have side effects - the effect of the function occurs only through the return value of the function. However, it can be inefficient in terms of the amount of computation and of memory used. In contrast, pass-by-reference is more efficient, but also more dangerous and less modular. It’s more difficult to reason about code that uses pass-by-reference because effects of calling a function can be hidden inside the function. 
Thus pass-by-value is directly related to functional programming.\nArrays and other non-scalar objects in Python are pass-by-reference (but note that tuples are immutable, so one could not modify a tuple that is passed as an argument).\n\ndef myfun(x):\n x[1] = 99\n\ny = [0, 1, 2]\nz = myfun(y)\ntype(z)\n\n<class 'NoneType'>\n\ny\n\n[0, 99, 2]\n\n\nLet’s see what operations cause arguments modified in a function to affect state outside of the function:\n\ndef myfun(f_scalar, f_x, f_x_new, f_x_newid, f_x_copy):\n\n f_scalar = 99 # global input unaffected\n f_x[0] = 99 # global input MODIFIED\n f_x_new = [99,2,3] # global input unaffected\n\n newx = f_x_newid\n newx[0] = 99 # global input MODIFIED\n\n xcopy = f_x_copy.copy() \n xcopy[0] = 99 # global input unaffected\n\n\nscalar = 1\nx = [1,2,3]\nx_new = np.array([1,2,3])\nx_newid = np.array([1,2,3])\nx_copy = np.array([1,2,3])\n\n\nmyfun(scalar, x, x_new, x_newid, x_copy)\n\nHere are the cases where state is preserved:\n\nscalar\n\n1\n\nx_new\n\narray([1, 2, 3])\n\nx_copy\n\narray([1, 2, 3])\n\n\nAnd here are the cases where state is modified:\n\nx\n\n[99, 2, 3]\n\nx_newid\n\narray([99, 2, 3])\n\n\nBasically if you replace the reference (object name) then the state outside the function is preserved. That’s because a new local variable in the function scope is created. However in the ` If you modify part of the object, state is not preserved.\nThe same behavior occurs with other mutable objects such as numpy arrays.\n\nPointers\nTo put pass-by-value vs. pass-by-reference in a broader context, I want to briefly discuss the idea of a pointer, common in compiled languages such as C.\n\nint x = 3;\nint* ptr;\nptr = &x;\n*ptr * 7; // returns 21\n\n\nThe int* declares ptr to be a pointer to (the address of) the integer x.\nThe &x gets the address where x is stored.\n*ptr dereferences ptr, returning the value in that address (which is 3 since ptr is the address of x.\n\nVectors in C are really pointers to a block of memory:\n\nint x[10];\n\nIn this case x will be the address of the first element of the vector. We can access the first element as x[0] or *x.\nWhy have we gone into this? In C, you can pass a pointer as an argument to a function. The result is that only the scalar address is copied and not the entire object, and inside the function, one can modify the original object, with the new value persisting on exit from the function. For example in the following example one passes in the address of an object and that object is then modified in place, affecting its value when the function call finishes.\n\nint myCal(int* ptr){\n *ptr = *ptr + *ptr;\n}\n\nmyCal(&x) # x itself will be modified\n\nSo Python behaves similarly to the use of pointers in C." + "text": "Pass by value vs. pass by reference\nWhen talking about programming languages, one often distinguishes pass-by-value and pass-by-reference.\nPass-by-value means that when a function is called with one or more arguments, a copy is made of each argument and the function operates on those copies. In pass-by-value, changes to an argument made within a function do not affect the value of the argument in the calling environment.\nPass-by-reference means that the arguments are not copied, but rather that information is passed allowing the function to find and modify the original value of the objects passed into the function. 
In pass-by-reference changes inside a function can affect the object outside of the function.\nPass-by-value is elegant and modular in that functions do not have side effects - the effect of the function occurs only through the return value of the function. However, it can be inefficient in terms of the amount of computation and of memory used. In contrast, pass-by-reference is more efficient, but also more dangerous and less modular. It’s more difficult to reason about code that uses pass-by-reference because effects of calling a function can be hidden inside the function. Thus pass-by-value is directly related to functional programming.\nArrays and other non-scalar objects in Python are pass-by-reference (but note that tuples are immutable, so one could not modify a tuple that is passed as an argument).\n\ndef myfun(x):\n x[1] = 99\n\ny = [0, 1, 2]\nz = myfun(y)\ntype(z)\n\n<class 'NoneType'>\n\ny\n\n[0, 99, 2]\n\n\nLet’s see what operations cause arguments modified in a function to affect state outside of the function:\n\ndef myfun(f_scalar, f_x, f_x_new, f_x_newid, f_x_copy):\n\n f_scalar = 99 # global input unaffected\n f_x[0] = 99 # global input MODIFIED\n f_x_new = [99,2,3] # global input unaffected\n\n newx = f_x_newid\n newx[0] = 99 # global input MODIFIED\n\n xcopy = f_x_copy.copy() \n xcopy[0] = 99 # global input unaffected\n\n\nscalar = 1\nx = [1,2,3]\nx_new = np.array([1,2,3])\nx_newid = np.array([1,2,3])\nx_copy = np.array([1,2,3])\n\n\nmyfun(scalar, x, x_new, x_newid, x_copy)\n\nHere are the cases where state is preserved:\n\nscalar\n\n1\n\nx_new\n\narray([1, 2, 3])\n\nx_copy\n\narray([1, 2, 3])\n\n\nAnd here are the cases where state is modified:\n\nx\n\n[99, 2, 3]\n\nx_newid\n\narray([99, 2, 3])\n\n\nBasically if you replace the reference (object name) then the state outside the function is preserved. That’s because a new local variable in the function scope is created. However in the ` If you modify part of the object, state is not preserved.\nThe same behavior occurs with other mutable objects such as numpy arrays.\n\nPointers\nTo put pass-by-value vs. pass-by-reference in a broader context, I want to briefly discuss the idea of a pointer, common in compiled languages such as C.\n\nint x = 3;\nint* ptr;\nptr = &x;\n*ptr * 7; // returns 21\n\n\nThe int* declares ptr to be a pointer to (the address of) the integer x.\nThe &x gets the address where x is stored.\n*ptr dereferences ptr, returning the value in that address (which is 3 since ptr is the address of x.\n\nArrays in C are really pointers to a block of memory:\n\nint x[10];\n\nIn this case x will be the address of the first element of the vector. We can access the first element as x[0] or *x.\nWhy have we gone into this? In C, you can pass a pointer as an argument to a function. The result is that only the scalar address is copied and not the entire object, and inside the function, one can modify the original object, with the new value persisting on exit from the function. For example in the following example one passes in the address of an object and that object is then modified in place, affecting its value when the function call finishes.\n\nint myCal(int* ptr){\n *ptr = *ptr + *ptr;\n}\n\nmyCal(&x) # x itself will be modified\n\nSo Python behaves similarly to the use of pointers in C." 
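As a minimal sketch (an addition to these notes, using only built-ins and the standard copy module; the function and variable names are illustrative), one can confirm that an argument refers to the caller's object using id(), and make an explicit deep copy when side effects on nested objects are not wanted:

import copy

def modify(a):
    print(id(a) == id(x))   # True: 'a' refers to the same list object the caller passed in
    a[0] = 99               # so this modifies the caller's list

x = [1, 2, 3]
modify(x)
print(x)                    # [99, 2, 3]

y = [[1, 2], [3, 4]]
y_safe = copy.deepcopy(y)   # deep copy: nested lists are copied as well
y_safe[0][0] = 99
print(y)                    # [[1, 2], [3, 4]] -- unaffected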
}, { "objectID": "units/unit5-programming.html#namespaces-and-scopes", "href": "units/unit5-programming.html#namespaces-and-scopes", "title": "Programming concepts", "section": "Namespaces and scopes", - "text": "Namespaces and scopes\nAs discussed here in the Python docs, a namespace is a mapping from names to objects that allows Python to find objects by name via clear rules that enforce modularity and avoid name conflicts.\nNamespaces are created and removed through the course of executing Python code. When a function is run, a namespace for the local variables in the function is created, and then deleted when the function finishes executing. Separate function calls (including recursive calls) have separate namespaces.\nScope is closely related concept – a scope determines what namespaces are accessible from a given place in one’s code. Scopes are nested and determine where and in what order Python searches the various namespaces for objects.\nNote that the ideas of namespaces and scopes are relevant in most other languages, though the details of how they work can differ.\nThese ideas are very important for modularity, isolating the names of objects to avoid conflicts.\nThis allows you to use the same name in different modules or submodules, as well as different packages using the same name.\nOf course to make the objects in a module or package available we need to use import.\nConsider what happens if you have two modules that both use x and you import x using from.\n\nfrom mypkg.mymod import x\nfrom mypkg.mysubpkg import x\nx # which x is used?\n\nWe’ve added x twice to the namespace of the global scope. Are both available? Did one ‘overwrite’ the other? How do I access the other one?\nThis is much better:\n\nimport mypkg\nmypkg.x\n\n7\n\nimport mypkg.mysubpkg\nmypkg.mysubpkg.x\n\n999\n\n\nSide note: notice that import mypkg causes the name mypkg itself to be in the current (global) scope.\nWe can see the objects in a given namespace/scope using dir().\n\nxyz = 7\ndir()\n\n['Bear', 'GrizzlyBear', '__annotations__', '__builtins__', '__doc__', '__loader__', '__name__', '__package__', '__spec__', 'add', 'apply_fun', 'content', 'copy', 'f', 'files', 'foo', 'function', 'function_a', 'function_b', 'function_c', 'io', 'it', 'item', 'm', 'match', 'math', 'myArray', 'mydict', 'myfun', 'myit', 'mylist', 'mymod', 'mypkg', 'myts', 'myts2', 'mytsFullCopy', 'mytsRef', 'mytuple', 'new_x', 'np', 'num399', 'os', 'out1', 'out2', 'partial', 'pattern', 'platform', 'plt', 'r', 're', 'result', 'return_group', 'round3', 's', 'scalar', 'smoke', 'stream', 'subprocess', 'sum_args', 'sys', 'text', 'time', 'tmp', 'traceback', 'tsSimClass', 'x', 'x_copy', 'x_new', 'x_newid', 'xyz', 'y', 'y1', 'y2', 'y3', 'yog', 'z']\n\nimport mypkg\ndir(mypkg)\n\n['__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__path__', '__spec__', 'myfun', 'mymod', 'mysubpkg', 'x']\n\nimport mypkg.mymod\ndir(mypkg.mymod)\n\n['__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__spec__', 'myfun', 'x']\n\nimport builtins\ndir(builtins)\n\n['ArithmeticError', 'AssertionError', 'AttributeError', 'BaseException', 'BaseExceptionGroup', 'BlockingIOError', 'BrokenPipeError', 'BufferError', 'BytesWarning', 'ChildProcessError', 'ConnectionAbortedError', 'ConnectionError', 'ConnectionRefusedError', 'ConnectionResetError', 'DeprecationWarning', 'EOFError', 'Ellipsis', 'EncodingWarning', 'EnvironmentError', 'Exception', 'ExceptionGroup', 'False', 'FileExistsError', 
'FileNotFoundError', 'FloatingPointError', 'FutureWarning', 'GeneratorExit', 'IOError', 'ImportError', 'ImportWarning', 'IndentationError', 'IndexError', 'InterruptedError', 'IsADirectoryError', 'KeyError', 'KeyboardInterrupt', 'LookupError', 'MemoryError', 'ModuleNotFoundError', 'NameError', 'None', 'NotADirectoryError', 'NotImplemented', 'NotImplementedError', 'OSError', 'OverflowError', 'PendingDeprecationWarning', 'PermissionError', 'ProcessLookupError', 'RecursionError', 'ReferenceError', 'ResourceWarning', 'RuntimeError', 'RuntimeWarning', 'StopAsyncIteration', 'StopIteration', 'SyntaxError', 'SyntaxWarning', 'SystemError', 'SystemExit', 'TabError', 'TimeoutError', 'True', 'TypeError', 'UnboundLocalError', 'UnicodeDecodeError', 'UnicodeEncodeError', 'UnicodeError', 'UnicodeTranslateError', 'UnicodeWarning', 'UserWarning', 'ValueError', 'Warning', 'ZeroDivisionError', '_', '__build_class__', '__debug__', '__doc__', '__import__', '__loader__', '__name__', '__package__', '__spec__', 'abs', 'aiter', 'all', 'anext', 'any', 'ascii', 'bin', 'bool', 'breakpoint', 'bytearray', 'bytes', 'callable', 'chr', 'classmethod', 'compile', 'complex', 'copyright', 'credits', 'delattr', 'dict', 'dir', 'divmod', 'enumerate', 'eval', 'exec', 'exit', 'filter', 'float', 'format', 'frozenset', 'getattr', 'globals', 'hasattr', 'hash', 'help', 'hex', 'id', 'input', 'int', 'isinstance', 'issubclass', 'iter', 'len', 'license', 'list', 'locals', 'map', 'max', 'memoryview', 'min', 'next', 'object', 'oct', 'open', 'ord', 'pow', 'print', 'property', 'quit', 'range', 'repr', 'reversed', 'round', 'set', 'setattr', 'slice', 'sorted', 'staticmethod', 'str', 'sum', 'super', 'tuple', 'type', 'vars', 'zip']\n\n\nHere are the key scopes to be aware of, in order (“LEGB”) of how the namespaces are searched:\n\nLocal scope: objects available within function (or class method).\nnon-local (Enclosing) scope: objects available from functions enclosing a given function (we’ll talk about this more later; this relates to lexical scoping).\nGlobal (aka ‘module’) scope: objects available in the module in which the function is defined (which may simply be the default global scope when you start the Python interpreter). This is also the local scope if the code is not executing inside a function.\nBuilt-ins scope: objects provided by Python through the built-ins module but available from anywhere.\n\nNote that import adds the name of the imported module in the local scope.\nWe can see the local and global namespaces using locals() and globals().\n\ncat local.py\n\ngx = 7\n\ndef myfun(z):\n y = z*3\n print(\"local: \", locals())\n print(\"global: \", globals())\n\n\nRun the following code to see what is in the different namespaces:\n\nimport local\n\ngx = 99\nlocal.myfun(3)\n\nStrangely (for me being more used to R, where package namespaces are locked), we can add an object to a namespace created from a module or package:\n\nmymod.x = 33\ndir(mymod)\n\n['__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__spec__', 'myfun', 'range', 'x']\n\nimport numpy as np\nnp.x = 33\n'x' in dir(np)\n\nTrue\n\n\nAs more motivation, consider this example.\nSuppose we have this code in a module named test_scope.py:\n\ncat test_scope.py\n\nmagic_number = 7\n\ndef myfun(val):\n return(val * magic_number)\n\n\nNow suppose we also define magic_number in the scope in which myfun is called from.\n\nimport test_scope\nmagic_number = 900\ntest_scope.myfun(3)\n\n21\n\n\nWe see that Python uses magic_number from the module. 
What would be bad about using magic_number from the global scope of the Python session rather than the global scope of the module? Consider a case where instead of using the test_scope.py module we were using code from a package.\n\nLexical scoping (enclosing scopes)\nIn this section, we seek to understand what happens in the following circumstance. Namely, where does Python get the value for the object x?\n\ndef f(y):\n return(x + y)\n\nf(3)\n\nVariables in the enclosing scope are available within a function. The enclosing scope is the scope in which a function is defined, not the scope from which a function is called.\nThis approach is called lexical scoping. R and many other languages also use lexical scoping.\nThe behavior of basing lookup on where functions are defined rather than where they are called from extends the local-global scoping discussed in the previous section, with similar motivation.\nLet’s dig deeper to understand where Python looks for non-local variables, illustrating lexical scoping:\n\n## Case 1\nx = 3\ndef f2():\n print(x)\n\ndef f():\n x = 7\n f2()\n\nf() # what will happen?\n\n## Case 2\nx = 3\ndef f2()\n print(x)\n\ndef f():\n x = 7\n f2()\n\nx = 100\nf() # what will happen?\n\n## Case 3\nx = 3\ndef f():\n def f2():\n print(x) \n x = 7\n f2()\n\nf() # what will happen?\n\n## Case 4\nx = 3\ndef f():\n def f2():\n print(x)\n f2()\n\nf() # what will happen?\n\nHere’s a tricky example:\n\ny = 100\ndef fun_constructor():\n y = 10\n def g(x):\n return(x + y) \n return(g)\n\n## fun_constructor() creates functions\nmyfun = fun_constructor()\nmyfun(3)\n\n13\n\n\nLet’s work through this:\n\nWhat is the enclosing scope for the function g()?\nWhat does g() use for y?\nWhere is myfun defined (this is tricky)?\nWhat is the enclosing (scope) for myfun()?\nWhen fun_constructor() finishes, does its namespace disappear? What would happen if it did?\nWhat does myfun use for y?\n\n[TODO: is there a way to access variables in a closure (i.e., in this case access ‘y’ based on myfun). Is there a way to find the enclosing scope for ‘myfun’ and look in it? See the inspect package]\nBe careful when using variables from non-local scopes as the value of that variable may well not be what you expect it to be. In general one wants to think carefully before using variables that are taken from outside the local scope, but in some cases it can be useful.\nNext we’ll see some ways of accessing variables outside of the local scope and of enclosing scopes.\n\n\nGlobal and non-local variables\nWe can create and modify global variables and variables in the enclosing scope using global and nonlocal respectively. Note that global is in the context of the current module so this could be a variable in your current Python session if you’re working with functions defined in that session or a global variable in a module or package.\n\ndel x\n\ndef myfun():\n global x\n x = 7\n\nmyfun()\nprint(x)\n\n7\n\nx = 9\nmyfun()\nprint(x)\n\n7\n\n\n\ndef outer_function():\n x = 10 # Outer variable\n def inner_function():\n nonlocal x\n x = 20 # Modifying the outer variable\n print(x) # Output: 10\n inner_function()\n print(x) # Output: 20\n\nouter_function()\n\n10\n20\n\n\nIn R, one can do similar things using the global assignment operator <<-.\n\n\nClosures\nOne way to associate data with functions is to use a closure. This is a functional programming way to achieve something like an OOP class. 
This Wikipedia entry nicely summarizes the idea, which is a general functional programming idea and not specific to Python.\nUsing a closure involves creating one (or more functions) within a function call and returning the function(s) as the output. When one executes the original function, the new function(s) is created and returned and one can then call that new function(s). The new function then can access objects in the enclosing scope (the scope of the original function) and can use nonlocal to assign into the enclosing scope, to which the function (or the multiple functions) have access. The nice thing about this compared to using a global variable is that the data in the closure is bound up with the function(s) and is protected from being changed by the user.\n\nx = np.random.normal(size = 5)\ndef scaler_constructor(input):\n data = input\n def g(param):\n return(param * data) \n return(g)\n\nscaler = scaler_constructor(x)\ndel x # to demonstrate we no longer need x\nscaler(3)\n\narray([ 0.5081473 , 2.22166935, -2.86110181, -0.79865552, 0.09784364])\n\nscaler(6)\n\narray([ 1.0162946 , 4.44333871, -5.72220361, -1.59731104, 0.19568728])\n\n\nSo calling scaler(3) multiplies 3 by the value of data stored in the closure (the namespace of the enclosing scope) of the function scaler.\nNote that it can be hard to see the memory use involved in the closure.\nHere’s a more realistic example. There are other ways you could do this, but this is slick:\n\ndef make_container(n):\n x = np.zeros(n)\n i = 0\n def store(value = None):\n nonlocal x, i\n if value is None:\n return x\n else:\n x[i] = value\n i += 1\n return store\n\n\nnboot = 20\nbootmeans = make_container(nboot)\n\nimport pandas as pd\niris = pd.read_csv('https://raw.githubusercontent.com/pandas-dev/pandas/master/pandas/tests/io/data/csv/iris.csv')\ndata = iris['SepalLength']\n\nfor i in range(nboot): \n bootmeans(np.mean(np.random.choice(data, size = len(data), replace = True)))\n\n\nbootmeans()\n\narray([5.874 , 5.95533333, 5.86 , 5.754 , 5.77466667,\n 5.81733333, 5.902 , 5.79933333, 5.842 , 5.81933333,\n 5.854 , 5.97 , 5.84 , 5.80133333, 5.99333333,\n 5.88133333, 5.83133333, 5.84533333, 5.91466667, 5.79666667])\n\nbootmeans.__closure__\n\n(<cell at 0x7f9cc6b362c0: int object at 0x7f9cd7aef968>, <cell at 0x7f9cc6b36680: numpy.ndarray object at 0x7f9cc6c76d30>)\n\n\n\n\nDecorators\nNow that we’ve seen function generators, it’s straightforward to discuss decorators.\nA decorator is a wrapper around a function that extends the functionality of the function without actually modifying the function.\nWe can create a simple decorator “manually” like this:\n\ndef verbosity_wrapper(myfun):\n def wrapper(*args, **kwargs):\n print(f\"Starting {myfun.__name__}.\")\n output = myfun(*args, **kwargs)\n print(f\"Finishing {myfun.__name__}.\")\n return output\n return wrapper\n \nverbose_rnorm = verbosity_wrapper(np.random.normal)\n\nx = verbose_rnorm(size = 5)\n\nStarting normal.\nFinishing normal.\n\nx\n\narray([ 0.07202449, -0.67672674, 0.98248139, -1.65748762, 0.81795808])\n\n\nPython provides syntax that helps you create decorators with less work (this is an example of the general idea of syntactic sugar).\nWe can easily apply our decorator defined above to a function as follows. 
Now the function name refers to the wrapped version of the function.\n\n@verbosity_wrapper\ndef myfun(x):\n return x\n\ny = myfun(7)\n\nStarting myfun.\nFinishing myfun.\n\ny\n\n7\n\n\nOur decorator doesn’t do anything useful, but hopefully you can imagine that the idea of being able to have more control over the operation of functions could be useful. For example we could set up a timing wrapper so that when we run a function, we get a report on how long it took to run the function. Or using the idea of a closure, we could keep a running count of the number of times a function has been called.\nOne real-world example of using decorators is in setting up functions to run in parallel in dask, which we’ll discuss in Unit 7." + "text": "Namespaces and scopes\nAs discussed here in the Python docs, a namespace is a mapping from names to objects that allows Python to find objects by name via clear rules that enforce modularity and avoid name conflicts.\nNamespaces are created and removed through the course of executing Python code. When a function is run, a namespace for the local variables in the function is created, and then deleted when the function finishes executing. Separate function calls (including recursive calls) have separate namespaces.\nScope is closely related concept – a scope determines what namespaces are accessible from a given place in one’s code. Scopes are nested and determine where and in what order Python searches the various namespaces for objects.\nNote that the ideas of namespaces and scopes are relevant in most other languages, though the details of how they work can differ.\nThese ideas are very important for modularity, isolating the names of objects to avoid conflicts.\nThis allows you to use the same name in different modules or submodules, as well as different packages using the same name.\nOf course to make the objects in a module or package available we need to use import.\nConsider what happens if you have two modules that both use x and you import x using from.\n\nfrom mypkg.mymod import x\nfrom mypkg.mysubpkg import x\nx # which x is used?\n\nWe’ve added x twice to the namespace of the global scope. Are both available? Did one ‘overwrite’ the other? 
How do I access the other one?\nThis is much better:\n\nimport mypkg\nmypkg.x\n\n7\n\nimport mypkg.mysubpkg\nmypkg.mysubpkg.x\n\n999\n\n\nSide note: notice that import mypkg causes the name mypkg itself to be in the current (global) scope.\nWe can see the objects in a given namespace/scope using dir().\n\nxyz = 7\ndir()\n\n['Bear', 'GrizzlyBear', '__annotations__', '__builtins__', '__doc__', '__loader__', '__name__', '__package__', '__spec__', 'add', 'apply_fun', 'content', 'copy', 'f', 'files', 'foo', 'function', 'function_a', 'function_b', 'function_c', 'io', 'it', 'item', 'm', 'match', 'math', 'myArray', 'mydict', 'myfun', 'myit', 'mylist', 'mymod', 'mypkg', 'myts', 'myts2', 'mytsFullCopy', 'mytsRef', 'mytuple', 'new_x', 'np', 'num399', 'os', 'out1', 'out2', 'partial', 'pattern', 'platform', 'plt', 'r', 're', 'result', 'return_group', 'round3', 's', 'scalar', 'smoke', 'stream', 'subprocess', 'sum_args', 'sys', 'text', 'time', 'tmp', 'traceback', 'tsSimClass', 'x', 'x_copy', 'x_new', 'x_newid', 'xyz', 'y', 'y1', 'y2', 'y3', 'yog', 'z']\n\nimport mypkg\ndir(mypkg)\n\n['__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__path__', '__spec__', 'myfun', 'mymod', 'mysubpkg', 'x']\n\nimport mypkg.mymod\ndir(mypkg.mymod)\n\n['__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__spec__', 'myfun', 'x']\n\nimport builtins\ndir(builtins)\n\n['ArithmeticError', 'AssertionError', 'AttributeError', 'BaseException', 'BaseExceptionGroup', 'BlockingIOError', 'BrokenPipeError', 'BufferError', 'BytesWarning', 'ChildProcessError', 'ConnectionAbortedError', 'ConnectionError', 'ConnectionRefusedError', 'ConnectionResetError', 'DeprecationWarning', 'EOFError', 'Ellipsis', 'EncodingWarning', 'EnvironmentError', 'Exception', 'ExceptionGroup', 'False', 'FileExistsError', 'FileNotFoundError', 'FloatingPointError', 'FutureWarning', 'GeneratorExit', 'IOError', 'ImportError', 'ImportWarning', 'IndentationError', 'IndexError', 'InterruptedError', 'IsADirectoryError', 'KeyError', 'KeyboardInterrupt', 'LookupError', 'MemoryError', 'ModuleNotFoundError', 'NameError', 'None', 'NotADirectoryError', 'NotImplemented', 'NotImplementedError', 'OSError', 'OverflowError', 'PendingDeprecationWarning', 'PermissionError', 'ProcessLookupError', 'RecursionError', 'ReferenceError', 'ResourceWarning', 'RuntimeError', 'RuntimeWarning', 'StopAsyncIteration', 'StopIteration', 'SyntaxError', 'SyntaxWarning', 'SystemError', 'SystemExit', 'TabError', 'TimeoutError', 'True', 'TypeError', 'UnboundLocalError', 'UnicodeDecodeError', 'UnicodeEncodeError', 'UnicodeError', 'UnicodeTranslateError', 'UnicodeWarning', 'UserWarning', 'ValueError', 'Warning', 'ZeroDivisionError', '_', '__build_class__', '__debug__', '__doc__', '__import__', '__loader__', '__name__', '__package__', '__spec__', 'abs', 'aiter', 'all', 'anext', 'any', 'ascii', 'bin', 'bool', 'breakpoint', 'bytearray', 'bytes', 'callable', 'chr', 'classmethod', 'compile', 'complex', 'copyright', 'credits', 'delattr', 'dict', 'dir', 'divmod', 'enumerate', 'eval', 'exec', 'exit', 'filter', 'float', 'format', 'frozenset', 'getattr', 'globals', 'hasattr', 'hash', 'help', 'hex', 'id', 'input', 'int', 'isinstance', 'issubclass', 'iter', 'len', 'license', 'list', 'locals', 'map', 'max', 'memoryview', 'min', 'next', 'object', 'oct', 'open', 'ord', 'pow', 'print', 'property', 'quit', 'range', 'repr', 'reversed', 'round', 'set', 'setattr', 'slice', 'sorted', 'staticmethod', 'str', 'sum', 'super', 'tuple', 'type', 
'vars', 'zip']\n\n\nHere are the key scopes to be aware of, in order (“LEGB”) of how the namespaces are searched:\n\nLocal scope: objects available within function (or class method).\nnon-local (Enclosing) scope: objects available from functions enclosing a given function (we’ll talk about this more later; this relates to lexical scoping).\nGlobal (aka ‘module’) scope: objects available in the module in which the function is defined (which may simply be the default global scope when you start the Python interpreter). This is also the local scope if the code is not executing inside a function.\nBuilt-ins scope: objects provided by Python through the built-ins module but available from anywhere.\n\nNote that import adds the name of the imported module to the namespace of the current (i.e., local) scope.\nWe can see the local and global namespaces using locals() and globals().\n\ncat local.py\n\ngx = 7\n\ndef myfun(z):\n y = z*3\n print(\"local: \", locals())\n print(\"global: \", globals())\n\n\nRun the following code to see what is in the different namespaces:\n\nimport local\n\ngx = 99\nlocal.myfun(3)\n\nStrangely (for me being more used to R, where package namespaces are locked), we can add an object to a namespace created from a module or package:\n\nmymod.x = 33\ndir(mymod)\n\n['__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__spec__', 'myfun', 'range', 'x']\n\nimport numpy as np\nnp.x = 33\n'x' in dir(np)\n\nTrue\n\n\nAs more motivation, consider this example.\nSuppose we have this code in a module named test_scope.py:\n\ncat test_scope.py\n\nmagic_number = 7\n\ndef myfun(val):\n return(val * magic_number)\n\n\nNow suppose we also define magic_number in the scope in which myfun is called from.\n\nimport test_scope\nmagic_number = 900\ntest_scope.myfun(3)\n\n21\n\n\nWe see that Python uses magic_number from the module. What would be bad about using magic_number from the global scope of the Python session rather than the global scope of the module? Consider a case where instead of using the test_scope.py module we were using code from a package.\n\nLexical scoping (enclosing scopes)\nIn this section, we seek to understand what happens in the following circumstance. Namely, where does Python get the value for the object x?\n\ndef f(y):\n return(x + y)\n\nf(3)\n\nVariables in the enclosing scope are available within a function. The enclosing scope is the scope in which a function is defined, not the scope from which a function is called.\nThis approach is called lexical scoping. 
R and many other languages also use lexical scoping.\nThe behavior of basing lookup on where functions are defined rather than where they are called from extends the local-global scoping discussed in the previous section, with similar motivation.\nLet’s dig deeper to understand where Python looks for non-local variables, illustrating lexical scoping:\n\n## Case 1\nx = 3\ndef f2():\n print(x)\n\ndef f():\n x = 7\n f2()\n\nf() # what will happen?\n\n## Case 2\nx = 3\ndef f2()\n print(x)\n\ndef f():\n x = 7\n f2()\n\nx = 100\nf() # what will happen?\n\n## Case 3\nx = 3\ndef f():\n def f2():\n print(x) \n x = 7\n f2()\n\nx = 100\nf() # what will happen?\n\n## Case 4\nx = 3\ndef f():\n def f2():\n print(x)\n f2()\n\nx = 100\nf() # what will happen?\n\nHere’s a tricky example:\n\ny = 100\ndef fun_constructor():\n y = 10\n def g(x):\n return(x + y) \n return(g)\n\n## fun_constructor() creates functions\nmyfun = fun_constructor()\nmyfun(3)\n\nLet’s work through this:\n\nWhat is the enclosing scope for the function g()?\nWhich y does g() use?\nWhere is myfun defined (this is tricky – how does myfun relate to g)?\nWhat is the enclosing scope for myfun()?\nWhen fun_constructor() finishes, does its namespace disappear? What would happen if it did?\nWhat does myfun use for y?\n\nWe can use the inspect package to see information about the closure.\n\nimport inspect\ninspect.getclosurevars(myfun)\n\nClosureVars(nonlocals={}, globals={'copy': <module 'copy' from '/system/linux/mambaforge-3.11/lib/python3.11/copy.py'>}, builtins={}, unbound=set())\n\n\n(Note that I haven’t fully investigated the use of inspect, but it looks like it has a lot of useful tools.)\nBe careful when using variables from non-local scopes as the value of that variable may well not be what you expect it to be. In general one wants to think carefully before using variables that are taken from outside the local scope, but in some cases it can be useful.\nNext we’ll see some ways of accessing variables outside of the local scope.\n\n\nGlobal and non-local variables\nWe can create and modify global variables and variables in the enclosing scope using global and nonlocal respectively. Note that global is in the context of the current module so this could be a variable in your current Python session if you’re working with functions defined in that session or a global variable in a module or package.\n\ndel x\n\ndef myfun():\n global x\n x = 7\n\nmyfun()\nprint(x)\n\n7\n\nx = 9\nmyfun()\nprint(x)\n\n7\n\n\n\ndef outer_function():\n x = 10 # Outer variable\n def inner_function():\n nonlocal x\n x = 20 # Modifying the outer variable\n print(x) # Output: 10\n inner_function()\n print(x) # Output: 20\n\nouter_function()\n\n10\n20\n\n\nIn R, one can do similar things using the global assignment operator <<-.\n\n\nClosures\nOne way to associate data with functions is to use a closure. This is a functional programming way to achieve something like an OOP class. This Wikipedia entry nicely summarizes the idea, which is a general functional programming idea and not specific to Python.\nUsing a closure involves creating one (or more functions) within a function call and returning the function(s) as the output. When one executes the original function (the constructor), the new function(s) is created and returned and one can then call that function(s). 
The function then can access objects in the enclosing scope (the scope of the constructor) and can use nonlocal to assign into the enclosing scope, to which the function (or the multiple functions) have access. The nice thing about this compared to using a global variable is that the data in the closure is bound up with the function(s) and is protected from being changed by the user.\n\nx = np.random.normal(size = 5)\ndef scaler_constructor(input):\n data = input\n def g(param):\n return(param * data) \n return(g)\n\nscaler = scaler_constructor(x)\ndel x # to demonstrate we no longer need x\nscaler(3)\n\narray([ 0.5081473 , 2.22166935, -2.86110181, -0.79865552, 0.09784364])\n\nscaler(6)\n\narray([ 1.0162946 , 4.44333871, -5.72220361, -1.59731104, 0.19568728])\n\n\nSo calling scaler(3) multiplies 3 by the value of data stored in the closure (the namespace of the enclosing scope) of the function scaler.\nNote that it can be hard to see the memory use involved in the closure.\nHere’s a more realistic example. There are other ways you could do this, but this is slick:\n\ndef make_container(n):\n x = np.zeros(n)\n i = 0\n def store(value = None):\n nonlocal x, i\n if value is None:\n return x\n else:\n x[i] = value\n i += 1\n return store\n\n\nnboot = 20\nbootmeans = make_container(nboot)\n\nimport pandas as pd\niris = pd.read_csv('https://raw.githubusercontent.com/pandas-dev/pandas/master/pandas/tests/io/data/csv/iris.csv')\ndata = iris['SepalLength']\n\nfor i in range(nboot): \n bootmeans(np.mean(np.random.choice(data, size = len(data), replace = True)))\n\n\nbootmeans()\n\narray([5.874 , 5.95533333, 5.86 , 5.754 , 5.77466667,\n 5.81733333, 5.902 , 5.79933333, 5.842 , 5.81933333,\n 5.854 , 5.97 , 5.84 , 5.80133333, 5.99333333,\n 5.88133333, 5.83133333, 5.84533333, 5.91466667, 5.79666667])\n\nbootmeans.__closure__\n\n(<cell at 0x7fbec758ded0: int object at 0x7fbedc5f3968>, <cell at 0x7fbec758e290: numpy.ndarray object at 0x7fbec76ced30>)" + }, + { + "objectID": "units/unit5-programming.html#decorators", + "href": "units/unit5-programming.html#decorators", + "title": "Programming concepts", + "section": "Decorators", + "text": "Decorators\nNow that we’ve seen function generators, it’s straightforward to discuss decorators.\nA decorator is a wrapper around a function that extends the functionality of the function without actually modifying the function.\nWe can create a simple decorator “manually” like this:\n\ndef verbosity_wrapper(myfun):\n def wrapper(*args, **kwargs):\n print(f\"Starting {myfun.__name__}.\")\n output = myfun(*args, **kwargs)\n print(f\"Finishing {myfun.__name__}.\")\n return output\n return wrapper\n \nverbose_rnorm = verbosity_wrapper(np.random.normal)\n\nx = verbose_rnorm(size = 5)\n\nStarting normal.\nFinishing normal.\n\nx\n\narray([ 0.07202449, -0.67672674, 0.98248139, -1.65748762, 0.81795808])\n\n\nPython provides syntax that helps you create decorators with less work (this is an example of the general idea of syntactic sugar).\nWe can easily apply our decorator defined above to a function as follows. Now the function name refers to the wrapped version of the function.\n\n@verbosity_wrapper\ndef myfun(x):\n return x\n\ny = myfun(7)\n\nStarting myfun.\nFinishing myfun.\n\ny\n\n7\n\n\nOur decorator doesn’t do anything useful, but hopefully you can imagine that the idea of being able to have more control over the operation of functions could be useful. 
For example we could set up a timing wrapper so that when we run a function, we get a report on how long it took to run the function. Or using the idea of a closure, we could keep a running count of the number of times a function has been called.\nOne real-world example of using decorators is in setting up functions to run in parallel in dask, which we’ll discuss in Unit 7." }, { "objectID": "units/unit5-programming.html#overview-2", @@ -585,56 +592,56 @@ "href": "units/unit5-programming.html#monitoring-memory-use", "title": "Programming concepts", "section": "Monitoring memory use", - "text": "Monitoring memory use\n\nMonitoring overall memory use on a UNIX-style computer\nTo understand how much memory is available on your computer, one needs to have a clear understanding of disk caching. The operating system will generally cache files/data in memory when it reads from disk. Then if that information is still in memory the next time it is needed, it will be much faster to access it the second time around than if it had to read the information from disk. While the cached information is using memory, that same memory is immediately available to other processes, so the memory is available even though it is “in use”.\nWe can see this via free -h (the -h is for ‘human-readable’, i.e. show in GB (G)) on Linux machine.\n total used free shared buff/cache available \n Mem: 251G 998M 221G 2.6G 29G 247G \n Swap: 7.6G 210M 7.4G\nYou’ll generally be interested in the Mem row. (See below for some comments on Swap.) The shared column is complicated and probably won’t be of use to you. The buff/cache column shows how much space is used for disk caching and related purposes but is actually available. Hence the available column is the sum of the free and buff/cache columns (more or less). In this case only about 1 GB is in use (indicated in the used column).\ntop (Linux or Mac) and vmstat (on Linux) both show overall memory use, but remember that the amount actually available to you is the amount free plus any buff/cache usage. Here is some example output from vmstat:\n\n procs -----------memory---------- ---swap-- -----io---- -system-- ------cpu----- \n r b swpd free buff cache si so bi bo in cs us sy id wa st \n 1 0 215140 231655120 677944 30660296 0 0 1 2 0 0 18 0 82 0 0\nIt shows 232 GB free and 31 GB used for cache and therefore available, for a total of 263 GB available.\nHere are some example lines from top:\n KiB Mem : 26413715+total, 23180236+free, 999704 used, 31335072 buff/cache \n KiB Swap: 7999484 total, 7784336 free, 215148 used. 25953483+avail Mem\nWe see that this machine has 264 GB RAM (the total column in the Mem row), with 259.5 GB available (232 GB free plus 31 GB buff/cache as seen in the Mem row). (I realize the numbers don’t quite add up for reasons I don’t fully understand, but we probably don’t need to worry about that degree of exactness.) Only 1 GB is in use.\nSwap is essentially the reverse of disk caching. It is disk space that is used for memory when the machine runs out of physical memory. You never want your machine to be using swap for memory because your jobs will slow to a crawl. As seen above, the swap line in both free and top shows 8 GB swap space, with very little in use, as desired.\n\n\nMonitoring memory use in Python\nThere are a number of ways to see how much memory is being used. 
When Python is actively executing statements, you can use top from the UNIX shell.\nIn Python, we can call out to the system to get the info we want:\n\nimport psutil\n\n# Get memory information\nmemory_info = psutil.Process().memory_info()\n\n# Print the memory usage\nprint(\"Memory usage:\", memory_info.rss/10**6, \" Mb.\")\n\n# Let's turn that into a function for later use:\n\nMemory usage: 470.368256 Mb.\n\ndef mem_used():\n print(\"Memory usage:\", psutil.Process().memory_info().rss/10**6, \" Mb.\")\n\nWe can see the size of an object (in bytes) with sys.getsizeof().\n\nmy_list = [1, 2, 3, 4, 5]\nsys.getsizeof(my_list)\n\n104\n\nx = np.random.normal(size = 10**7) # should use about 80 Mb\nsys.getsizeof(x)\n\n80000112\n\n\nHowever, we need to be careful about objects that refer to other objects:\n\ny = [3, x]\nsys.getsizeof(y) # Whoops!\n\n72\n\n\nHere’s a trick where we serialize the object, as if to export it, and then see how long the binary representation is.\n\nimport pickle\nser_object = pickle.dumps(y)\nsys.getsizeof(ser_object)\n\n80000201\n\n\nThere are also some flags that one can start python with that allow one to see information about memory use and allocation. See man python. You could also look into the memory_profiler or pympler packages." + "text": "Monitoring memory use\n\nMonitoring overall memory use on a UNIX-style computer\nTo understand how much memory is available on your computer, one needs to have a clear understanding of disk caching. The operating system will generally cache files/data in memory when it reads from disk. Then if that information is still in memory the next time it is needed, it will be much faster to access it the second time around than if it had to read the information from disk. While the cached information is using memory, that same memory is immediately available to other processes, so the memory is available even though it is “in use”.\nWe can see this via free -h (the -h is for ‘human-readable’, i.e. show in GB (G)) on Linux machine.\n total used free shared buff/cache available \n Mem: 251G 998M 221G 2.6G 29G 247G \n Swap: 7.6G 210M 7.4G\nYou’ll generally be interested in the Mem row. (See below for some comments on Swap.) The shared column is complicated and probably won’t be of use to you. The buff/cache column shows how much space is used for disk caching and related purposes but is actually available. Hence the available column is the sum of the free and buff/cache columns (more or less). In this case only about 1 GB is in use (indicated in the used column).\ntop (Linux or Mac) and vmstat (on Linux) both show overall memory use, but remember that the amount actually available to you is the amount free plus any buff/cache usage. Here is some example output from vmstat:\n\n procs -----------memory---------- ---swap-- -----io---- -system-- ------cpu----- \n r b swpd free buff cache si so bi bo in cs us sy id wa st \n 1 0 215140 231655120 677944 30660296 0 0 1 2 0 0 18 0 82 0 0\nIt shows 232 GB free and 31 GB used for cache and therefore available, for a total of 263 GB available.\nHere are some example lines from top:\n KiB Mem : 26413715+total, 23180236+free, 999704 used, 31335072 buff/cache \n KiB Swap: 7999484 total, 7784336 free, 215148 used. 25953483+avail Mem\nWe see that this machine has 264 GB RAM (the total column in the Mem row), with 259.5 GB available (232 GB free plus 31 GB buff/cache as seen in the Mem row). 
(I realize the numbers don’t quite add up for reasons I don’t fully understand, but we probably don’t need to worry about that degree of exactness.) Only 1 GB is in use.\nSwap is essentially the reverse of disk caching. It is disk space that is used for memory when the machine runs out of physical memory. You never want your machine to be using swap for memory because your jobs will slow to a crawl. As seen above, the swap line in both free and top shows 8 GB swap space, with very little in use, as desired.\n\n\nMonitoring memory use in Python\nThere are a number of ways to see how much memory is being used. When Python is actively executing statements, you can use top from the UNIX shell.\nIn Python, we can call out to the system to get the info we want:\n\nimport psutil\n\n# Get memory information\nmemory_info = psutil.Process().memory_info()\n\n# Print the memory usage\nprint(\"Memory usage:\", memory_info.rss/10**6, \" Mb.\")\n\n# Let's turn that into a function for later use:\n\nMemory usage: 472.645632 Mb.\n\ndef mem_used():\n print(\"Memory usage:\", psutil.Process().memory_info().rss/10**6, \" Mb.\")\n\nWe can see the size of an object (in bytes) with sys.getsizeof().\n\nmy_list = [1, 2, 3, 4, 5]\nsys.getsizeof(my_list)\n\n104\n\nx = np.random.normal(size = 10**7) # should use about 80 Mb\nsys.getsizeof(x)\n\n80000112\n\n\nHowever, we need to be careful about objects that refer to other objects:\n\ny = [3, x]\nsys.getsizeof(y) # Whoops!\n\n72\n\n\nHere’s a trick where we serialize the object, as if to export it, and then see how long the binary representation is.\n\nimport pickle\nser_object = pickle.dumps(y)\nsys.getsizeof(ser_object)\n\n80000201\n\n\nThere are also some flags that one can start python with that allow one to see information about memory use and allocation. See man python. You could also look into the memory_profiler or pympler packages." }, { "objectID": "units/unit5-programming.html#how-memory-is-used-in-python", "href": "units/unit5-programming.html#how-memory-is-used-in-python", "title": "Programming concepts", "section": "How memory is used in Python", - "text": "How memory is used in Python\n\nTwo key tools: id and is\nWe can use the id function to see where in memory an object is stored and is to see if two object are actually the same objects in memory. It’s particularly useful for understanding storage and memory use for complicated data structures. 
We’ll also see that they can be handy tools for seeing where copies are made and where they are not.\n\nx = np.random.normal(size = 10**7)\nid(x)\n\n140311150938704\n\nsys.getsizeof(x)\n\n80000112\n\ny = x\nid(y)\n\n140311150938704\n\nx is y\n\nTrue\n\nsys.getsizeof(y)\n\n80000112\n\nz = x.copy()\nid(z)\n\n140311153128912\n\nsys.getsizeof(z)\n\n80000112\n\n\n\n\nMemory use in specific circumstances\n\nHow lists are stored\nHere we can use id to determine how the overall list is stored as well as the elements of the list.\n\nnums = np.random.normal(size = 5)\nobj = [nums, nums, np.random.normal(size = 5), ['adfs']]\n\nid(nums)\n\n140311150937936\n\nid(obj)\n\n140311325165888\n\nid(obj[0])\n\n140311150937936\n\nid(obj[1])\n\n140311150937936\n\nid(obj[2])\n\n140311150945712\n\nid(obj[3])\n\n140311151627584\n\nid(obj[3][0])\n\n140311151630320\n\nobj[0] is obj[1]\n\nTrue\n\nobj[0] is obj[2]\n\nFalse\n\n\nWhat do we notice?\n\nThe list itself appears to be a array of pointers to the component elements.\nEach element has its own address.\nTwo elements of a list can use the same memory (see the first two elements here, whose contents are at the same memory address).\nA list element can use the same memory as another object (or part of another object).\n\n\n\nHow character strings are stored.\nSimilar tricks are used for storing strings (and also integers). We’ll explore this in a problem on PS4.\n\n\nModifying elements in place\nWhat do this simple experiment tell us?\n\nx = np.random.normal(size = 5)\nid(x)\n\n140311150946768\n\nx[2] = 3.5\nid(x)\n\n140311150946768\n\n\nIt makes some sense that modifying elements of an object here doesn’t cause a copy – if it did, working with large objects would be very difficult.\n\n\n\nWhen are copies made?\nLet’s try to understand when Python uses additional memory for objects, and how it knows when it can delete memory. We’ll use large objects so that we can use free or top to see how memory use by the Python process changes.\n\nx = np.random.normal(size = 10**8)\nid(x)\n\n140311151333360\n\ny = x\nid(y)\n\n140311151333360\n\nx = np.random.normal(size = 10**8)\nid(x)\n\n140311150946768\n\n\nOnly if we re-assign x to reference a different object does additional memory get used.\n\nHow does Python know when it can free up memory?\nPython keeps track of how many names refer to an object and only removes memory when there are no remaining references to an object.\n\nimport sys\n\nx = np.random.normal(size = 10**8)\ny = x\nsys.getrefcount(y)\n\n3\n\ndel x\nsys.getrefcount(y)\n\n2\n\ndel y\n\nWe can see the number of references using sys.getrefcount. Confusingly, the number is one higher than we’d expect, because it includes the temporary reference from passing the object as the argument to getrefcount.\n\nx = np.random.normal(size = 5)\nsys.getrefcount(x) # 2 \n\n2\n\ny = x\nsys.getrefcount(x) # 3\n\n3\n\nsys.getrefcount(y) # 3\n\n3\n\ndel y\nsys.getrefcount(x) # 2\n\n2\n\ny = x\nx = np.random.normal(size = 5)\nsys.getrefcount(y) # 2\n\n2\n\nsys.getrefcount(x) # 2\n\n2\n\n\nThis notion of reference counting occurs in other contexts, such as shared pointers in C++ and in how R handles copying and garbage collection." + "text": "How memory is used in Python\n\nTwo key tools: id and is\nWe can use the id function to see where in memory an object is stored and is to see if two object are actually the same objects in memory. It’s particularly useful for understanding storage and memory use for complicated data structures. 
We’ll also see that they can be handy tools for seeing where copies are made and where they are not.\n\nx = np.random.normal(size = 10**7)\nid(x)\n\n140457191377488\n\nsys.getsizeof(x)\n\n80000112\n\ny = x\nid(y)\n\n140457191377488\n\nx is y\n\nTrue\n\nsys.getsizeof(y)\n\n80000112\n\nz = x.copy()\nid(z)\n\n140457193549392\n\nsys.getsizeof(z)\n\n80000112\n\n\n\n\nMemory use in specific circumstances\n\nHow lists are stored\nHere we can use id to determine how the overall list is stored as well as the elements of the list.\n\nnums = np.random.normal(size = 5)\nobj = [nums, nums, np.random.normal(size = 5), ['adfs']]\n\nid(nums)\n\n140457191376720\n\nid(obj)\n\n140457364871680\n\nid(obj[0])\n\n140457191376720\n\nid(obj[1])\n\n140457191376720\n\nid(obj[2])\n\n140457191384496\n\nid(obj[3])\n\n140457192017152\n\nid(obj[3][0])\n\n140457192019632\n\nobj[0] is obj[1]\n\nTrue\n\nobj[0] is obj[2]\n\nFalse\n\n\nWhat do we notice?\n\nThe list itself appears to be a array of references (pointers) to the component elements.\nEach element has its own address.\nTwo elements of a list can use the same memory (see the first two elements here, whose contents are at the same memory address).\nA list element can use the same memory as another object (or part of another object).\n\n\n\nHow character strings are stored.\nSimilar tricks are used for storing strings (and also integers). We’ll explore this in a problem on PS4.\n\n\nModifying elements in place\nWhat do this simple experiment tell us?\n\nx = np.random.normal(size = 5)\nid(x)\n\n140457191385552\n\nx[2] = 3.5\nid(x)\n\n140457191385552\n\n\nIt makes some sense that modifying elements of an object here doesn’t cause a copy – if it did, working with large objects would be very difficult.\n\n\n\nWhen are copies made?\nLet’s try to understand when Python uses additional memory for objects, and how it knows when it can delete memory. We’ll use large objects so that we can use free or top to see how memory use by the Python process changes.\n\nx = np.random.normal(size = 10**8)\nid(x)\n\n140457191755760\n\ny = x\nid(y)\n\n140457191755760\n\nx = np.random.normal(size = 10**8)\nid(x)\n\n140457191385552\n\n\nOnly if we re-assign x to reference a different object does additional memory get used.\n\nHow does Python know when it can free up memory?\nPython keeps track of how many names refer to an object and only removes memory when there are no remaining references to an object.\n\nimport sys\n\nx = np.random.normal(size = 10**8)\ny = x\nsys.getrefcount(y)\n\n3\n\ndel x\nsys.getrefcount(y)\n\n2\n\ndel y\n\nWe can see the number of references using sys.getrefcount. Confusingly, the number is one higher than we’d expect, because it includes the temporary reference from passing the object as the argument to getrefcount.\n\nx = np.random.normal(size = 5)\nsys.getrefcount(x) # 2 \n\n2\n\ny = x\nsys.getrefcount(x) # 3\n\n3\n\nsys.getrefcount(y) # 3\n\n3\n\ndel y\nsys.getrefcount(x) # 2\n\n2\n\ny = x\nx = np.random.normal(size = 5)\nsys.getrefcount(y) # 2\n\n2\n\nsys.getrefcount(x) # 2\n\n2\n\n\nThis notion of reference counting occurs in other contexts, such as shared pointers in C++ and in how R handles copying and garbage collection." 
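As a small supplementary sketch of the copy behavior discussed above (a minimal illustration; the array x and the sizes used here are arbitrary, not from the notes), note that for a NumPy array an augmented assignment such as x += 1 modifies the existing array in place (the id is unchanged), whereas x = x + 1 allocates a new array and rebinds the name:

import numpy as np

x = np.random.normal(size = 10**6)
orig_id = id(x)

x += 1                    # in-place update: the same underlying array is modified
print(id(x) == orig_id)   # True

x = x + 1                 # builds a new array and rebinds the name x to it
print(id(x) == orig_id)   # False

The distinction matters for memory use with large arrays, since the rebinding form temporarily holds both the old and the new array in memory.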
},
  {
    "objectID": "units/unit5-programming.html#strategies-for-saving-memory",
    "href": "units/unit5-programming.html#strategies-for-saving-memory",
    "title": "Programming concepts",
    "section": "Strategies for saving memory",
    "text": "Strategies for saving memory\nA few basic strategies for saving memory include:\n\nAvoiding unnecessary copies.\nRemoving objects that are not being used, at which point the Python garbage collector should free up the memory.\n\nIf you’re really trying to optimize memory use, you may also consider:\n\nUsing types that take up less memory (e.g., Bool, Int16, Float32) when possible.\n\nx = np.array(np.random.normal(size = 5), dtype = \"float32\")\nx.itemsize\n\n4\n\nx = np.array([3,4,2,-2], dtype = \"int16\")\nx.itemsize\n\n2\n\n\nReading data in from files in chunks rather than reading the entire dataset (more in Unit 7).\nExploring packages such as arrow for efficiently using memory, as discussed in Unit 2."
  },
  {
    "objectID": "units/unit5-programming.html#example",
    "href": "units/unit5-programming.html#example",
    "title": "Programming concepts",
    "section": "Example",
    "text": "Example\nLet’s work through a real example where we keep a running tally of current memory in use and maximum memory used in a function call. We’ll want to consider hidden uses of memory, when copies are made, and lazy evaluation. This code (translated from the original R code) is courtesy of Yuval Benjamini. For our purposes here, let’s assume that xvar and yvar are very long numpy arrays using a lot of memory.\n\ndef fastcount(xvar, yvar):\n naline = np.isnan(xvar)\n naline[np.isnan(yvar)] = True\n localx = xvar.copy()\n localy = yvar.copy()\n localx[naline] = 0\n localy[naline] = 0\n useline = ~naline\n ## We'll ignore the rest of the code.\n ## ...."
  },
  {
    "objectID": "units/unit5-programming.html#interpreters-and-compilation",
    "href": "units/unit5-programming.html#interpreters-and-compilation",
    "title": "Programming concepts",
    "section": "Interpreters and compilation",
    "text": "Interpreters and compilation\n\nWhy are interpreted languages slow?\nCompiled code runs quickly because the original code has been translated into instructions (machine language) that the processor can understand (i.e., zeros and ones). 
In the process of doing so, various checking and lookup steps are done once and don’t need to be redone when running the compiled code.\nIn contrast, when one runs code in an interpreted language such as Python or R, the interpreter needs to do all the checking and lookup each time the code is run. This is required because the types and locations in memory of the variables could have changed.\nWe’ll focus on Python in the following discussion, but most of the concepts apply to other interpreted languages.\nFor example, consider this code:\n\nx = 3\nabs(x)\nx*7\nabs(x)\nx = 'hi'\ntry:\n abs(x)\nexcept Exception as error:\n print(error)\nx*3\n\nBecause of dynamic typing, when the interpreter sees abs(x) it needs to check if x is something to on the absolute value function be used, including dealing with the fact that x could be a list or array with many numbers in it. In addition it needs to (using scoping rules) look up the value of x. (Consider that x might not even exist at the point that abs(x) is called.) Only then can the absolute value calculation happen. For the multiplication, Python needs to lookup the version of * that can be used, depending on the type of x.\nLet’s consider writing a loop:\n\nfor i in range(10):\n if np.random.normal(size = 1) > 0:\n x = 'hi'\n if np.random.normal(size = 1) > 0.5:\n del x\n x[i] = np.exp(x[i])\n\nThere is no way around the fact that because of how dynamic this is, the interpreter needs to check if x exists, if it is a vector of sufficient length, if it contains numeric values, and it needs to go retrieve the required value, EVERY TIME the np.exp() is executed. Now the code above is unusual, and in most cases, we wouldn’t have the if statements that modify x. So you could imagine a process by which the checking were done on the first iteration and then not needed after that – that gets into the idea of just-in-time compilation, discussed later.\nThe standard Python interpreter (CPython) is a C function so in some sense everything that happens is running as compiled code, but there are lots more things being done to accomplish a given task using interpreted code than if the task had been written directly in code that is compiled. By analogy, consider talking directly to a person in a language you both know compared to talking to a person via an interpreter who has to translate between two languages. Ultimately, the same information gets communicated (hopefully!) but the number of words spoken and time involved is much greater.\nWhen running more complicated functions, there is often a lot of checking that is part of the function itself. For example scipy’s solve_triangular function ultimately calls out to the trtrs Lapack function, but before doing so, there is a lot of checking that can take time. To that point, the documentation suggests you might set check_finite=False to improve performance at the expense of potential problems if the input matrices contain troublesome elements.\nWe can flip the question on its head and ask what operations in an interpreted language will execute quickly. 
In Python, these include:\n\noperations that call out to compiled C code,\nlinear algebra operations (these call out to compiled C or Fortran code provided by the BLAS and LAPACK software packages), and\nvectorized calls rather than loops:\n\nvectorized calls generally run loops in compiled C code rather than having the loop run in Python, and\nthat means that the interpreter doesn’t have to do all the checking discussed above for every iteration of the loop.\n\n\n\n\nCompilation\n\nOverview\nCompilation is the process of turning code in a given language (such a C++) into machine code. Machine code is the code that the processor actually executes. The machine code is stored in the executable file, which is a binary file. The history of programming has seen ever great levels of abstraction, so that humans can write code using syntax that is easier for us to understand, re-use, and develop building blocks that can be put together to do complicated tasks. For example assembly language is a step above machine code. Languages like C and Fortran provide additional abstraction beyond that. The Statistics 750 class at CMU has a nice overview if you want to see more details.\nNote that interpreters such as Python are themselves programs – the standard Python interpreter (CPython) is a C program that has been compiled. It happens to be a program that processes Python code. The interpreter doesn’t turn Python code into machine code, but the interpreter itself is machine code.\n\n\nJust-in-time (JIT) compilation\nStandard compilation (ahead-of-time or AOT compilation) happens before any code is executed and can involve a lot of optimization to produce the most efficient machine code possible.\nIn contrast, just-in-time (JIT) compilation happens at the time that the code is executing. JIT compilation is heavily used in Julia, which is very fast (in some cases as fast as C). JIT compilation involves translating to machine code as the code is running. One nice aspect is that the results are cached so that if code is rerun, the compilation process doesn’t have to be redone. So if you use a language like Julia, you’ll see that the speed can vary drastically between the first time and later times you run a given function during a given session.\nOne thing that needs to be dealt with is type checking. As discussed above, part of why an interpreter is slow is because the type of the variable(s) involved in execution of a piece of code is not known in advance, so the interpreter needs to check the type. In JIT systems, there are often type inference systems that determine variable types.\nJIT compilation can involve translation from the original code to machine code or translation of bytecode (see next section) to machine code.\n\n\nByte compiling (optional)\nFunctions in Python and Python packages may byte compiled. What does that mean? Byte-compiled code is a special representation that can be executed more efficiently because it is in the form of compact codes that encode the results of parsing and semantic analysis of scoping and other complexities of the Python source code. This byte code can be executed faster than the original Python code because it skips the stage of having to be interpreted by the Python interpreter.\nIf you look at the file names in the directory of an installed Python package you may see files with the .pyc extension. These files have been byte-compiled.\nWe can byte compile our own functions using either the py_compile or compileall modules. 
Here’s an example (silly since as experienced Python programmers, we would use vectorized calculation here rather than this unvectorized code.)\n\nimport time\n\ndef f(vals):\n x = np.zeros(len(vals))\n for i in range(len(vals)):\n x[i] = np.exp(vals[i])\n return(x)\n\nx = np.random.normal(size = 10**6)\nt0 = time.time()\nout = f(x)\ntime.time() - t0\n\n0.7586061954498291\n\nt0 = time.time()\nout = np.exp(x)\ntime.time() - t0\n\n0.01432943344116211\n\n\n\nimport py_compile\npy_compile.compile('vec.py')\n\n'__pycache__/vec.cpython-311.pyc'\n\n\n\ncp __pycache__/vec.cpython-311.pyc vec.pyc\nrm vec.py # make sure non-compiled module not loaded\n\n\nimport vec\nvec.__file__\n \n\n'/accounts/vis/paciorek/teaching/243fall23/stat243-fall-2023/units/vec.pyc'\n\nt0 = time.time()\nout = vec.f(x)\ntime.time() - t0\n\n0.9548311233520508\n\n\nUnfortunately, as seen above byte compiling may not speed things up much. I’m not sure why." + "text": "Interpreters and compilation\n\nWhy are interpreted languages slow?\nCompiled code runs quickly because the original code has been translated into instructions (machine language) that the processor can understand (i.e., zeros and ones). In the process of doing so, various checking and lookup steps are done once and don’t need to be redone when running the compiled code.\nIn contrast, when one runs code in an interpreted language such as Python or R, the interpreter needs to do all the checking and lookup each time the code is run. This is required because the types and locations in memory of the variables could have changed.\nWe’ll focus on Python in the following discussion, but most of the concepts apply to other interpreted languages.\nFor example, consider this code:\n\nx = 3\nabs(x)\nx*7\nabs(x)\nx = 'hi'\ntry:\n abs(x)\nexcept Exception as error:\n print(error)\nx*3\n\nBecause of dynamic typing, when the interpreter sees abs(x) it needs to check if x is something to on the absolute value function be used, including dealing with the fact that x could be a list or array with many numbers in it. In addition it needs to (using scoping rules) look up the value of x. (Consider that x might not even exist at the point that abs(x) is called.) Only then can the absolute value calculation happen. For the multiplication, Python needs to lookup the version of * that can be used, depending on the type of x.\nLet’s consider writing a loop:\n\nfor i in range(10):\n if np.random.normal(size = 1) > 0:\n x = 'hi'\n if np.random.normal(size = 1) > 0.5:\n del x\n x[i] = np.exp(x[i])\n\nThere is no way around the fact that because of how dynamic this is, the interpreter needs to check if x exists, if it is a vector of sufficient length, if it contains numeric values, and it needs to go retrieve the required value, EVERY TIME the np.exp() is executed. Now the code above is unusual, and in most cases, we wouldn’t have the if statements that modify x. So you could imagine a process by which the checking were done on the first iteration and then not needed after that – that gets into the idea of just-in-time compilation, discussed later.\nThe standard Python interpreter (CPython) is a C function so in some sense everything that happens is running as compiled code, but there are lots more things being done to accomplish a given task using interpreted code than if the task had been written directly in code that is compiled. 
By analogy, consider talking directly to a person in a language you both know compared to talking to a person via an interpreter who has to translate between two languages. Ultimately, the same information gets communicated (hopefully!) but the number of words spoken and time involved is much greater.\nWhen running more complicated functions, there is often a lot of checking that is part of the function itself. For example scipy’s solve_triangular function ultimately calls out to the trtrs Lapack function, but before doing so, there is a lot of checking that can take time. To that point, the documentation suggests you might set check_finite=False to improve performance at the expense of potential problems if the input matrices contain troublesome elements.\nWe can flip the question on its head and ask what operations in an interpreted language will execute quickly. In Python, these include:\n\noperations that call out to compiled C code,\nlinear algebra operations (these call out to compiled C or Fortran code provided by the BLAS and LAPACK software packages), and\nvectorized calls rather than loops:\n\nvectorized calls generally run loops in compiled C code rather than having the loop run in Python, and\nthat means that the interpreter doesn’t have to do all the checking discussed above for every iteration of the loop.\n\n\n\n\nCompilation\n\nOverview\nCompilation is the process of turning code in a given language (such a C++) into machine code. Machine code is the code that the processor actually executes. The machine code is stored in the executable file, which is a binary file. The history of programming has seen ever great levels of abstraction, so that humans can write code using syntax that is easier for us to understand, re-use, and develop building blocks that can be put together to do complicated tasks. For example assembly language is a step above machine code. Languages like C and Fortran provide additional abstraction beyond that. The Statistics 750 class at CMU has a nice overview if you want to see more details.\nNote that interpreters such as Python are themselves programs – the standard Python interpreter (CPython) is a C program that has been compiled. It happens to be a program that processes Python code. The interpreter doesn’t turn Python code into machine code, but the interpreter itself is machine code.\n\n\nJust-in-time (JIT) compilation\nStandard compilation (ahead-of-time or AOT compilation) happens before any code is executed and can involve a lot of optimization to produce the most efficient machine code possible.\nIn contrast, just-in-time (JIT) compilation happens at the time that the code is executing. JIT compilation is heavily used in Julia, which is very fast (in some cases as fast as C). JIT compilation involves translating to machine code as the code is running. One nice aspect is that the results are cached so that if code is rerun, the compilation process doesn’t have to be redone. So if you use a language like Julia, you’ll see that the speed can vary drastically between the first time and later times you run a given function during a given session.\nOne thing that needs to be dealt with is type checking. As discussed above, part of why an interpreter is slow is because the type of the variable(s) involved in execution of a piece of code is not known in advance, so the interpreter needs to check the type. 
In JIT systems, there are often type inference systems that determine variable types.\nJIT compilation can involve translation from the original code to machine code or translation of bytecode (see next section) to machine code.\n\n\nByte compiling (optional)\nFunctions in Python and Python packages may byte compiled. What does that mean? Byte-compiled code is a special representation that can be executed more efficiently because it is in the form of compact codes that encode the results of parsing and semantic analysis of scoping and other complexities of the Python source code. This byte code can be executed faster than the original Python code because it skips the stage of having to be interpreted by the Python interpreter.\nIf you look at the file names in the directory of an installed Python package you may see files with the .pyc extension. These files have been byte-compiled.\nWe can byte compile our own functions using either the py_compile or compileall modules. Here’s an example (silly since as experienced Python programmers, we would use vectorized calculation here rather than this unvectorized code.)\n\nimport time\n\ndef f(vals):\n x = np.zeros(len(vals))\n for i in range(len(vals)):\n x[i] = np.exp(vals[i])\n return(x)\n\nx = np.random.normal(size = 10**6)\nt0 = time.time()\nout = f(x)\ntime.time() - t0\n\n0.8189809322357178\n\nt0 = time.time()\nout = np.exp(x)\ntime.time() - t0\n\n0.012688636779785156\n\n\n\nimport py_compile\npy_compile.compile('vec.py')\n\n'__pycache__/vec.cpython-311.pyc'\n\n\n\ncp __pycache__/vec.cpython-311.pyc vec.pyc\nrm vec.py # make sure non-compiled module not loaded\n\n\nimport vec\nvec.__file__\n \n\n'/accounts/vis/paciorek/teaching/243fall23/stat243-fall-2023/units/vec.pyc'\n\nt0 = time.time()\nout = vec.f(x)\ntime.time() - t0\n\n0.8231360912322998\n\n\nUnfortunately, as seen above byte compiling may not speed things up much. I’m not sure why." }, { "objectID": "units/unit5-programming.html#benchmarking-and-profiling", "href": "units/unit5-programming.html#benchmarking-and-profiling", "title": "Programming concepts", "section": "Benchmarking and profiling", - "text": "Benchmarking and profiling\nRecall that it’s a waste of time to optimize code before you determine (1) that the code is too slow for how it will be used and (2) which are the slow steps on which to focus your attempts to speed the code up. A 100x speedup in a step that takes 1% of the time will speed up the overall code by very little.\n\nTiming your code\nThere are a few ways to time code:\n\nimport time\nt0 = time.time()\nx = 3\nt1 = time.time()\n\nprint(f\"Execution time: {t1-t0} seconds.\")\n\nExecution time: 0.006555795669555664 seconds.\n\n\nIn general, it’s a good idea to repeat (replicate) your timing, as there is some stochasticity in how fast your computer will run a piece of code at any given moment.\nUsing time is fine for code that takes a little while to run, but for code that is really fast, it may not be very accurate. Measuring fast bits of code is tricky to do well. 
This next approach is better for benchmarking code (particularly faster bits of code).\n\nimport timeit\n\ntimeit.timeit('x = np.exp(3.)', setup = 'import numpy as np', number = 100)\n\n7.695332169532776e-05\n\ncode = '''\nx = np.exp(3.)\n'''\n\ntimeit.timeit(code, setup = 'import numpy as np', number = 100)\n\n7.366575300693512e-05\n\n\nThat reports the total time for the 100 replications.\nWe can run it from the command line.\n\npython -m timeit -s 'import numpy' -n 1000 'x = numpy.exp(3.)'\n\n1000 loops, best of 5: 549 nsec per loop\n\n\ntimeit ran the code 1000 times for 5 different repetitions, giving the average time for the 1000 samples for the best of the 5 repetitions.\n\n\nProfiling\nThe Cprofile module will show you how much time is spent in different functions, which can help you pinpoint bottlenecks in your code.\nI haven’t run this code when producing this document as the output of the profiling can be lengthy.\n\ndef lr_slow(y, x):\n xtx = x.T @ x\n xty = x.T @ y\n inv = np.linalg.inv(xtx)\n return inv @ xty\n\n## generate random observations and random matrix of predictors\ny = np.random.normal(size = 5000)\nx = np.random.normal(size = (5000,1000))\n\nimport cProfile\n\ncProfile.run('lr_slow(y,x)')\n\nThe cumtime column includes the time spent in subfunction calls while the tottime column excludes it.\nAs we’ll discuss in detail in Unit 10, we almost never want to explicitly invert a matrix. Instead we factorize the matrix and use the factorized result to do the computation of interest. In this case using the Cholesky decomposition is a standard approach, followed by solving triangular systems of equations.\n\nimport scipy as sp\n\ndef lr_fast(y, x):\n xtx = x.T @ x\n xty = x.T @ y\n L = sp.linalg.cholesky(xtx)\n out = sp.linalg.solve_triangular(L.T, \n sp.linalg.solve_triangular(L, xty, lower=True),\n lower=False)\n return(out)\n\ncProfile.run('lr_fast(y,x)')\n\nThe Cholesky now dominates the computational time (but is much faster than inv), so there’s not much more we can do in this case.\nYou might wonder if it’s better to use x.T or np.transpose(x). Try using timeit to decide.\nThe Python profilers (cProfile and profile (not shown)) use deterministic profiling – calculating the interval between events (i.e., function calls and returns). However, there is some limit to accuracy – the underlying ‘clock’ measures in units of about 0.001 seconds.\n(In contrast, R’s profiler works by sampling (statistical profiling) - every little while during a calculation it finds out what function R is in and saves that information to a file. So if you try to profile code that finishes really quickly, there’s not enough opportunity for the sampling to represent the calculation accurately and you may get spurious results.)" + "text": "Benchmarking and profiling\nRecall that it’s a waste of time to optimize code before you determine (1) that the code is too slow for how it will be used and (2) which are the slow steps on which to focus your attempts to speed the code up. 
A 100x speedup in a step that takes 1% of the time will speed up the overall code by very little.\n\nTiming your code\nThere are a few ways to time code:\n\nimport time\nt0 = time.time()\nx = 3\nt1 = time.time()\n\nprint(f\"Execution time: {t1-t0} seconds.\")\n\nExecution time: 0.004503488540649414 seconds.\n\n\nIn general, it’s a good idea to repeat (replicate) your timing, as there is some stochasticity in how fast your computer will run a piece of code at any given moment.\nUsing time is fine for code that takes a little while to run, but for code that is really fast, it may not be very accurate. Measuring fast bits of code is tricky to do well. This next approach is better for benchmarking code (particularly faster bits of code).\n\nimport timeit\n\ntimeit.timeit('x = np.exp(3.)', setup = 'import numpy as np', number = 100)\n\n0.00011431612074375153\n\ncode = '''\nx = np.exp(3.)\n'''\n\ntimeit.timeit(code, setup = 'import numpy as np', number = 100)\n\n7.406435906887054e-05\n\n\nThat reports the total time for the 100 replications.\nWe can run it from the command line.\n\npython -m timeit -s 'import numpy' -n 1000 'x = numpy.exp(3.)'\n\n:0: UserWarning: The test results are likely unreliable. The worst time (2.67 usec) was more than four times slower than the best time (623 nsec).\n1000 loops, best of 5: 623 nsec per loop\n\n\ntimeit ran the code 1000 times for 5 different repetitions, giving the average time for the 1000 samples for the best of the 5 repetitions.\n\n\nProfiling\nThe Cprofile module will show you how much time is spent in different functions, which can help you pinpoint bottlenecks in your code.\nI haven’t run this code when producing this document as the output of the profiling can be lengthy.\n\ndef lr_slow(y, x):\n xtx = x.T @ x\n xty = x.T @ y\n inv = np.linalg.inv(xtx)\n return inv @ xty\n\n## generate random observations and random matrix of predictors\ny = np.random.normal(size = 5000)\nx = np.random.normal(size = (5000,1000))\n\nimport cProfile\n\ncProfile.run('lr_slow(y,x)')\n\nThe cumtime column includes the time spent in subfunction calls while the tottime column excludes it.\nAs we’ll discuss in detail in Unit 10, we almost never want to explicitly invert a matrix. Instead we factorize the matrix and use the factorized result to do the computation of interest. In this case using the Cholesky decomposition is a standard approach, followed by solving triangular systems of equations.\n\nimport scipy as sp\n\ndef lr_fast(y, x):\n xtx = x.T @ x\n xty = x.T @ y\n L = sp.linalg.cholesky(xtx)\n out = sp.linalg.solve_triangular(L.T, \n sp.linalg.solve_triangular(L, xty, lower=True),\n lower=False)\n return(out)\n\ncProfile.run('lr_fast(y,x)')\n\nThe Cholesky now dominates the computational time (but is much faster than inv), so there’s not much more we can do in this case.\nYou might wonder if it’s better to use x.T or np.transpose(x). Try using timeit to decide.\nThe Python profilers (cProfile and profile (not shown)) use deterministic profiling – calculating the interval between events (i.e., function calls and returns). However, there is some limit to accuracy – the underlying ‘clock’ measures in units of about 0.001 seconds.\n(In contrast, R’s profiler works by sampling (statistical profiling) - every little while during a calculation it finds out what function R is in and saves that information to a file. 
So if you try to profile code that finishes really quickly, there’s not enough opportunity for the sampling to represent the calculation accurately and you may get spurious results.)" }, { "objectID": "units/unit5-programming.html#writing-efficient-python-code", "href": "units/unit5-programming.html#writing-efficient-python-code", "title": "Programming concepts", "section": "Writing efficient Python code", - "text": "Writing efficient Python code\nWe’ll discuss a variety of these strategies, including:\n\nPre-allocating memory rather than growing objects iteratively\nVectorization and use of fast matrix algebra\nConsideration of loops vs. map operations\nSpeed of lookup operations, including hashing\n\n\nPre-allocating memory\nLet’s consider whether we should pre-allocate space for the output of an operation or if it’s ok to keep extending the length of an array or list.\n\nn = 100000\nz = np.random.normal(size = n)\n\n## Pre-allocation\n\ndef fun_prealloc(vals):\n n = len(vals)\n x = [0] * n\n for i in range(n):\n x[i] = np.exp(vals[i])\n return(x)\n\n## Appending to a list\n\ndef fun_append(vals):\n x = []\n for i in range(n):\n x.append(np.exp(vals[i]))\n return(x)\n\n## Appending to a numpy array\n\ndef fun_append_np(vals):\n x = np.array([])\n for i in range(n):\n x = np.append(x, np.exp(vals[i]))\n return(x)\n\n\nt0 = time.time()\nout1 = fun_prealloc(z)\ntime.time() - t0\n\n0.07567524909973145\n\nt0 = time.time()\nout2 = fun_append(z)\ntime.time() - t0\n\n0.07505297660827637\n\nt0 = time.time()\nout3 = fun_append_np(z)\ntime.time() - t0\n\n2.527529239654541\n\n\nSo what’s going on? First let’s consider what is happening with the use of np.append. Note that it is a function, rather than a method, and we need to reassign to x. What must be happening in terms of memory use and copying when we append an element?\n\nx = np.random.normal(size = 5)\nid(x)\n\n140311151335472\n\nid(np.append(x, 3.34))\n\n140311151335568\n\n\nWe can avoid that large cost of copying and memory allocation by pre-allocating space for the entire output array. (This is equivalent to variable initialization in compiled languages.)\nOk, but how is it that we can append to the list at apparently no cost?\nIt’s not magic, just that Python is clever. 
Let’s get an idea of what is going on:\n\ndef fun_append2(vals):\n n = len(vals)\n x = []\n print(f\"Initial id: {id(x)}\")\n sz = sys.getsizeof(x)\n print(f\"iteration 0: size {sz}\")\n for i in range(n):\n x.append(np.exp(vals[i]))\n if sys.getsizeof(x) != sz:\n sz = sys.getsizeof(x)\n print(f\"iteration {i}: size {sz}\")\n print(f\"Final id: {id(x)}\")\n return(x)\n\nz = np.random.normal(size = 1000)\nout = fun_append2(z)\n\nInitial id: 140311008513600\niteration 0: size 56\niteration 0: size 88\niteration 4: size 120\niteration 8: size 184\niteration 16: size 248\niteration 24: size 312\niteration 32: size 376\niteration 40: size 472\niteration 52: size 568\niteration 64: size 664\niteration 76: size 792\niteration 92: size 920\niteration 108: size 1080\niteration 128: size 1240\niteration 148: size 1432\niteration 172: size 1656\niteration 200: size 1912\niteration 232: size 2200\niteration 268: size 2520\niteration 308: size 2872\niteration 352: size 3256\niteration 400: size 3704\niteration 456: size 4216\niteration 520: size 4792\niteration 592: size 5432\niteration 672: size 6136\niteration 760: size 6936\niteration 860: size 7832\niteration 972: size 8856\nFinal id: 140311008513600\n\n\nSurprisingly, the id of x doesn’t seem to change, even though we are allocating new memory at many of the iterations. I haven’t looked into this fully, but I believe that what is happening is that x is an wrapper object that contains within it a reference to an array of pointers to the list elements. The location of the wrapper object doesn’t change, but the underlying array of pointers is being reallocated.\nSide note: our assessment of size above does not include the actual size of the list elements.\n\nprint(sys.getsizeof(out))\n\n8856\n\nout[2] = np.random.normal(size = 100000)\nprint(sys.getsizeof(out))\n\n8856\n\n\nOne upshot of this is that if you need to grow an object use a Python list. Then once it is complete, you can always convert it to another type, such as a numpy array.\n\n\nVectorization and use of fast matrix algebra\nOne key way to write efficient Python code is to take advantage of numpy’s vectorized operations.\n\nn = 10**6\nz = np.random.normal(size = n)\nt0 = time.time()\nx = np.exp(z)\nprint(time.time() - t0)\n\n0.03270316123962402\n\nx = np.zeros(n) # Leave out pre-allocation timing to focus on computation.\nt0 = time.time()\nfor i in range(n):\n x[i] = np.exp(z[i])\n\n\nprint(time.time() - t0)\n\n0.808849573135376\n\n\nSo what is different in how Python handles the calculations above that explains the huge disparity in efficiency? The vectorized calculation is being done natively in C in a for loop. The explicit Python for loop involves executing the for loop in Python with repeated calls to C code at each iteration. This involves a lot of overhead because of the repeated processing of the Python code inside the loop. For example, in each iteration of the loop, Python is checking the types of the variables because it’s possible that the types might change, as discussed earlier.\nYou can usually get a sense for how quickly a Python call will pass things along to C or Fortran by looking at the body of the relevant function(s) being called.\nUnfortunately seeing the source code in Python often involves going and finding it in a file on disk, whereas in R, printing a function will show its source code. However you can use ?? in IPython to get the code for non-builtin functions. 
Consider numpy.linspace??.\nHere I found the source code for the scipy triangular_solve function, which calls out to a Fortran function trtrs, found in the LAPACK library.\n\n## On an SCF machine:\ncat /usr/local/linux/mambaforge-3.11/lib/python3.11/site-packages/scipy/linalg/_basic.py\n\nWith a bit more digging around we could verify that trtrs is a LAPACK funcion by doing some grepping:\n./linalg/_basic.py: trtrs, = get_lapack_funcs(('trtrs',), (a1, b1))\nMany numpy and scipy functions allow you to pass in arrays, and operate on those arrays in vectorized fashion. So before writing a for loop, look at the help information on the relevant function(s) to see if they operate in a vectorized fashion. Functions might take arrays for one or more of their arguments.\nOutside of the numerical packages, we often have to manually do the looping:\n\nx = [3.5, 2.7, 4.6]\ntry:\n math.cos(x)\nexcept Exception as error:\n print(error)\n\nmust be real number, not list\n\n[math.cos(val) for val in x]\n\n[-0.9364566872907963, -0.9040721420170612, -0.11215252693505487]\n\nlist(map(math.cos, x))\n\n[-0.9364566872907963, -0.9040721420170612, -0.11215252693505487]\n\n\nChallenge: Consider the chi-squared statistic involved in a test of independence in a contingency table:\n\\[\n\\chi^{2}=\\sum_{i}\\sum_{j}\\frac{(y_{ij}-e_{ij})^{2}}{e_{ij}},\\,\\,\\,\\, e_{ij}=\\frac{y_{i\\cdot}y_{\\cdot j}}{y_{\\cdot\\cdot}}\n\\]\nwhere \\(y_{i\\cdot}=\\sum_{j}y_{ij}\\) and \\(y_{\\cdot j} = \\sum_{i} y_{ij}\\) and \\(y_{\\cdot\\cdot} = \\sum_{i} \\sum_{j} y_{ij}\\). Write this in a vectorized way without any loops. Note that ‘vectorized’ calculations also work with matrices and arrays.\nSometimes we can exploit vectorized mathematical operations in surprising ways, though sometimes the code is uglier.\n\nx = np.random.normal(size = n)\n\n## List comprehension\ntimeit.timeit('truncx = [max(0,val) for val in x]', number = 10, globals = {'x':x})\n\n1.7915509291924536\n\n\n\n## Vectorized slice replacement\ntimeit.timeit('truncx = x; truncx[x < 0] = 0', number = 10, globals = {'x':x})\n\n0.009905248880386353\n\n\n\n## Vectorized math trick\ntimeit.timeit('truncx = x * x>0', number = 10, globals = {'x':x})\n\n0.017728352919220924\n\n\nWe’ll discuss what has to happen (in terms of calculations, memory allocation, and copying) in the two vectorized approaches to try to understand which is more efficient.\nAdditional tips:\n\nIf you do need to loop over dimensions of a matrix or array, if possible loop over the smallest dimension and use the vectorized calculation on the larger dimension(s). For example if you have a 10000 by 10 matrix, try to set up your problem so you can loop over the 10 columns rather than the 10000 rows.\nIn general, in Python looping over rows is likely to be faster than looping over columns because of numpy’s row-major ordering (by default, matrices are stored in memory as a long array in which values in a row are adjacent to each other). However how numpy handles this is more complicated (see more in Section 4.6 on the cache), such that it may not matter for numpy calculations.\nYou can use direct arithmetic operations to add/subtract/multiply/divide a vector by each column of a matrix, e.g. A*b does element-wise multiplication of each column of A by a vector b. 
If you need to operate by row, you can do it by transposing the matrix.\n\nCaution: relying on Python’s broadcasting rule in the context of vectorized operations, such as is done when direct-multiplying a matrix by a vector to scale the columns relative to each other, can be dangerous as the code may not be easy for someone to read and poses greater dangers of bugs. In some cases you may want to first write the code more directly and then compare the more efficient code to make sure the results are the same. It’s also a good idea to comment your code in such cases.\n\n\nAre map operations faster than loops?\nThe potential for inefficiency of looping and map operations in interpreted languages will depend in part on whether a substantial part of the work is in the overhead involved in the looping or in the time required by the function evaluation on each of the elements. If you’re worried about speed, it’s a good idea to benchmark map or pandas.apply against looping.\nHere’s an example where the bulk of time is in the actual computation and not in the looping itself. Here map is not faster than a loop.\n\nimport time\nimport statsmodels.api as sm\n\nn = 500000;\nnr = 10000\nnCalcs = int(n/nr)\n\nmat = np.random.normal(size = (nr, nCalcs))\n\nX = list(range(nr))\nX = sm.add_constant(X)\n\ndef regrFun(i):\n model = sm.OLS(mat[:,i], X)\n return(model.fit().params[1])\n\nt0 = time.time()\nout1 = list(map(regrFun, range(nCalcs)))\ntime.time() - t0\n\n0.06007575988769531\n\nt0 = time.time()\nout2 = np.zeros(nCalcs)\nfor i in range(nCalcs):\n out2[i] = regrFun(i)\n\n\ntime.time() - t0\n\n0.06388640403747559\n\n\nAnd here’s an example, where (unlike the previous example) the core computation is very fast, so we might expect the overhead of looping (in its various forms seen here) to be important.\n\nimport time\nn = 10**6\nx = np.random.normal(size = n)\n\nt0 = time.time()\nout = np.exp(x)\ntime.time() - t0\n\n0.014000177383422852\n\nt0 = time.time()\nvals = np.zeros(n)\nfor i in range(n):\n vals[i] = np.exp(x[i])\n\n\ntime.time() - t0\n\n0.842008113861084\n\nt0 = time.time()\nvals = [np.exp(v) for v in x]\ntime.time() - t0\n\n0.7019450664520264\n\nt0 = time.time()\nvals = list(map(np.exp, x))\ntime.time() - t0\n\n0.6085102558135986\n\n\nIn fact, it looks like we can’t avoid the overhead unless we use the vectorized call, which is of course the recommended approach in this case, both for speed and readability (and conciseness).\n\n\nMatrix algebra efficiency\nOften calculations that are not explicitly linear algebra calculations can be done as matrix algebra. If our Python installation has a fast (and possibly parallelized) BLAS, this allows our calculation to take advantage of it.\nFor example, we can sum the rows of a matrix by multiplying by a vector of ones.\n\nmat = np.random.normal(size=(500,500))\n\ntimeit.timeit('mat.dot(np.ones(500))', setup = 'import numpy as np', number = 1000, globals = {'mat': mat})\n\n0.0460930522531271\n\ntimeit.timeit('np.sum(mat, axis = 1)', setup = 'import numpy as np', number = 1000, globals = {'mat': mat})\n\n0.12684659287333488\n\n\nGiven the extra computation involved in actually multiplying each number by one, it’s surprising that this is faster than numpy sum function. One thing we’d want to know is whether the BLAS matrix multiplication call is being done in parallel.\nOn the other hand, big matrix operations can be slow.\nChallenge: Suppose you want a new matrix that computes the differences between successive columns of a matrix of arbitrary size. 
How would you do this as matrix algebra operations? It’s possible to write it as multiplying the matrix by another matrix that contains 0s, 1s, and -1s in appropriate places. Here it turns out that the for loop is much faster than matrix multiplication. However, there is a way to do it faster as matrix direct subtraction.\n\n\nOrder of operations and efficiency\nWhen doing matrix algebra, the order in which you do operations can be critical for efficiency. How should I order the following calculation?\n\nn = 5000\nA = np.random.normal(size=(n, n))\nB = np.random.normal(size=(n, n))\nx = np.random.normal(size=n)\n\nt0 = time.time()\nres1 = (A @ B) @ x\nprint(time.time() - t0)\n\n1.7937490940093994\n\nt0 = time.time()\nres1 = A @ (B @ x)\nprint(time.time() - t0)\n\n0.08377599716186523\n\n\nWhy is the second order much faster?\n\n\nAvoiding unnecessary operations\nWe can use the matrix direct product (i.e., A*B) to do some manipulations much more quickly than using matrix multiplication. Challenge: How can I use the direct product to find the trace of a matrix, \\(XY\\)?\nFinally, when working with diagonal matrices, you can generally get much faster results by being smart. The following operations: \\(X+D\\), \\(DX\\), \\(XD\\) are mathematically the sum of two matrices and products of two matrices. But we can do the computation without using two full matrices. Challenge: How?\n\nn = 1000\nX = np.random.normal(size=(n, n))\ndiagvals = np.random.normal(size=n)\nD = np.diag(diagvals)\n\n# The following lines are very inefficient\nsummedMat = X + D\nprodMat1 = D @ X\nprodMat2 = X @ D\n\nMore generally, sparse matrices and structured matrices (such as block diagonal matrices) can generally be worked with MUCH more efficiently than treating them as arbitrary matrices. The scipy.sparse package (for both structured and arbitrary sparse matrices) can help, as can specialized code available in other languages, such as C and Fortran packages.\n\n\nSpeed of lookup operations\nThere are lots of situations in which we need to retrieve values for subsequent computations. In some cases we might be retrieving elements of an array or looking up values in a dictionary.\nLet’s compare the speed of some different approaches to lookup.\n\nn = 1000\nx = list(np.random.normal(size = n))\nkeys = [str(v) for v in range(n)]\nxD = dict(zip(keys, x))\n\ntimeit.timeit(\"x[500]\", number = 10**6, globals = {'x':x})\n\n0.015508504584431648\n\ntimeit.timeit(\"xD['500']\", number=10**6, globals = {'xD':xD})\n\n0.029696810990571976\n\n\nHow is it that Python can look up by key in the dictionary at essentially the same speed as jumping to an index position? It uses hashing, which allows O(1) lookup. In contrast, if one has to look through each key in turn, that is O(n), which is much slower:\n\ntimeit.timeit(\"x[keys.index('500')]\", number = 10**6, globals = {'x':x, 'keys':keys})\n\n6.116314208135009\n\n\nAs a further point of contrast, if we look up elements by name in R in named vectors or lists, that is much slower than looking up by index, because R doesn’t use hashing in that context and has to scan through the objects one by one until it finds the one with the name it is looking for. This stands in contrast to R and Python being able to directly go to the position of interest based on the index of an array, or to the hash-based lookup in a Python dictionary or an R environment." 
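To make the hashing idea concrete, here is a toy sketch of bucket-based lookup (illustrative only; CPython's dict implementation is far more sophisticated). The built-in hash() picks a bucket, and only that short bucket is scanned.

n_buckets = 8
buckets = [[] for _ in range(n_buckets)]

def put(key, value):
    buckets[hash(key) % n_buckets].append((key, value))

def get(key):
    # Only the (short) bucket for this key is searched, not all of the keys.
    for k, v in buckets[hash(key) % n_buckets]:
        if k == key:
            return v
    raise KeyError(key)

put('500', 2.71)
get('500')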
+ "text": "Writing efficient Python code\nWe’ll discuss a variety of these strategies, including:\n\nPre-allocating memory rather than growing objects iteratively\nVectorization and use of fast matrix algebra\nConsideration of loops vs. map operations\nSpeed of lookup operations, including hashing\n\n\nPre-allocating memory\nLet’s consider whether we should pre-allocate space for the output of an operation or if it’s ok to keep extending the length of an array or list.\n\nn = 100000\nz = np.random.normal(size = n)\n\n## Pre-allocation\n\ndef fun_prealloc(vals):\n n = len(vals)\n x = [0] * n\n for i in range(n):\n x[i] = np.exp(vals[i])\n return(x)\n\n## Appending to a list\n\ndef fun_append(vals):\n x = []\n for i in range(n):\n x.append(np.exp(vals[i]))\n return(x)\n\n## Appending to a numpy array\n\ndef fun_append_np(vals):\n x = np.array([])\n for i in range(n):\n x = np.append(x, np.exp(vals[i]))\n return(x)\n\n\nt0 = time.time()\nout1 = fun_prealloc(z)\ntime.time() - t0\n\n0.07958865165710449\n\nt0 = time.time()\nout2 = fun_append(z)\ntime.time() - t0\n\n0.07781720161437988\n\nt0 = time.time()\nout3 = fun_append_np(z)\ntime.time() - t0\n\n2.6464624404907227\n\n\nSo what’s going on? First let’s consider what is happening with the use of np.append. Note that it is a function, rather than a method, and we need to reassign to x. What must be happening in terms of memory use and copying when we append an element?\n\nx = np.random.normal(size = 5)\nid(x)\n\n140457191757872\n\nid(np.append(x, 3.34))\n\n140457191757968\n\n\nWe can avoid that large cost of copying and memory allocation by pre-allocating space for the entire output array. (This is equivalent to variable initialization in compiled languages.)\nOk, but how is it that we can append to the list at apparently no cost?\nIt’s not magic, just that Python is clever. Let’s get an idea of what is going on:\n\ndef fun_append2(vals):\n n = len(vals)\n x = []\n print(f\"Initial id: {id(x)}\")\n sz = sys.getsizeof(x)\n print(f\"iteration 0: size {sz}\")\n for i in range(n):\n x.append(np.exp(vals[i]))\n if sys.getsizeof(x) != sz:\n sz = sys.getsizeof(x)\n print(f\"iteration {i}: size {sz}\")\n print(f\"Final id: {id(x)}\")\n return(x)\n\nz = np.random.normal(size = 1000)\nout = fun_append2(z)\n\nInitial id: 140457183986560\niteration 0: size 56\niteration 0: size 88\niteration 4: size 120\niteration 8: size 184\niteration 16: size 248\niteration 24: size 312\niteration 32: size 376\niteration 40: size 472\niteration 52: size 568\niteration 64: size 664\niteration 76: size 792\niteration 92: size 920\niteration 108: size 1080\niteration 128: size 1240\niteration 148: size 1432\niteration 172: size 1656\niteration 200: size 1912\niteration 232: size 2200\niteration 268: size 2520\niteration 308: size 2872\niteration 352: size 3256\niteration 400: size 3704\niteration 456: size 4216\niteration 520: size 4792\niteration 592: size 5432\niteration 672: size 6136\niteration 760: size 6936\niteration 860: size 7832\niteration 972: size 8856\nFinal id: 140457183986560\n\n\nSurprisingly, the id of x doesn’t seem to change, even though we are allocating new memory at many of the iterations. I haven’t looked into this fully, but I believe that what is happening is that x is an wrapper object that contains within it a reference to an array of pointers to the list elements. 
The location of the wrapper object doesn’t change, but the underlying array of pointers is being reallocated.\nSide note: our assessment of size above does not include the actual size of the list elements.\n\nprint(sys.getsizeof(out))\n\n8856\n\nout[2] = np.random.normal(size = 100000)\nprint(sys.getsizeof(out))\n\n8856\n\n\nOne upshot of this is that if you need to grow an object use a Python list. Then once it is complete, you can always convert it to another type, such as a numpy array.\n\n\nVectorization and use of fast matrix algebra\nOne key way to write efficient Python code is to take advantage of numpy’s vectorized operations.\n\nn = 10**6\nz = np.random.normal(size = n)\nt0 = time.time()\nx = np.exp(z)\nprint(time.time() - t0)\n\n0.03270316123962402\n\nx = np.zeros(n) # Leave out pre-allocation timing to focus on computation.\nt0 = time.time()\nfor i in range(n):\n x[i] = np.exp(z[i])\n\n\nprint(time.time() - t0)\n\n0.808849573135376\n\n\nSo what is different in how Python handles the calculations above that explains the huge disparity in efficiency? The vectorized calculation is being done natively in C in a for loop. The explicit Python for loop involves executing the for loop in Python with repeated calls to C code at each iteration. This involves a lot of overhead because of the repeated processing of the Python code inside the loop. For example, in each iteration of the loop, Python is checking the types of the variables because it’s possible that the types might change, as discussed earlier.\nYou can usually get a sense for how quickly a Python call will pass things along to C or Fortran by looking at the body of the relevant function(s) being called.\nUnfortunately seeing the source code in Python often involves going and finding it in a file on disk, whereas in R, printing a function will show its source code. However you can use ?? in IPython to get the code for non-builtin functions. Consider numpy.linspace??.\nHere I found the source code for the scipy triangular_solve function, which calls out to a Fortran function trtrs, found in the LAPACK library.\n\n## On an SCF machine:\ncat /usr/local/linux/mambaforge-3.11/lib/python3.11/site-packages/scipy/linalg/_basic.py\n\nWith a bit more digging around we could verify that trtrs is a LAPACK funcion by doing some grepping:\n./linalg/_basic.py: trtrs, = get_lapack_funcs(('trtrs',), (a1, b1))\nMany numpy and scipy functions allow you to pass in arrays, and operate on those arrays in vectorized fashion. So before writing a for loop, look at the help information on the relevant function(s) to see if they operate in a vectorized fashion. Functions might take arrays for one or more of their arguments.\nOutside of the numerical packages, we often have to manually do the looping:\n\nx = [3.5, 2.7, 4.6]\ntry:\n math.cos(x)\nexcept Exception as error:\n print(error)\n\nmust be real number, not list\n\n[math.cos(val) for val in x]\n\n[-0.9364566872907963, -0.9040721420170612, -0.11215252693505487]\n\nlist(map(math.cos, x))\n\n[-0.9364566872907963, -0.9040721420170612, -0.11215252693505487]\n\n\nChallenge: Consider the chi-squared statistic involved in a test of independence in a contingency table:\n\\[\n\\chi^{2}=\\sum_{i}\\sum_{j}\\frac{(y_{ij}-e_{ij})^{2}}{e_{ij}},\\,\\,\\,\\, e_{ij}=\\frac{y_{i\\cdot}y_{\\cdot j}}{y_{\\cdot\\cdot}}\n\\]\nwhere \\(y_{i\\cdot}=\\sum_{j}y_{ij}\\) and \\(y_{\\cdot j} = \\sum_{i} y_{ij}\\) and \\(y_{\\cdot\\cdot} = \\sum_{i} \\sum_{j} y_{ij}\\). Write this in a vectorized way without any loops. 
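One possible vectorized solution (a sketch, assuming the table is a 2-d numpy array; other approaches, e.g. using np.outer, would also work):

import numpy as np

def chisq_stat(y):
    row = y.sum(axis=1, keepdims=True)   # y_{i.}
    col = y.sum(axis=0, keepdims=True)   # y_{.j}
    e = row * col / y.sum()              # expected counts via broadcasting
    return ((y - e)**2 / e).sum()

y = np.array([[20., 30.], [25., 25.]])
chisq_stat(y)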
Note that ‘vectorized’ calculations also work with matrices and arrays.\nSometimes we can exploit vectorized mathematical operations in surprising ways, though sometimes the code is uglier.\n\nx = np.random.normal(size = n)\n\n## List comprehension\ntimeit.timeit('truncx = [max(0,val) for val in x]', number = 10, globals = {'x':x})\n\n1.7915509291924536\n\n\n\n## Vectorized slice replacement\ntimeit.timeit('truncx = x; truncx[x < 0] = 0', number = 10, globals = {'x':x})\n\n0.009905248880386353\n\n\n\n## Vectorized math trick\ntimeit.timeit('truncx = x * x>0', number = 10, globals = {'x':x})\n\n0.017728352919220924\n\n\nWe’ll discuss what has to happen (in terms of calculations, memory allocation, and copying) in the two vectorized approaches to try to understand which is more efficient.\nAdditional tips:\n\nIf you do need to loop over dimensions of a matrix or array, if possible loop over the smallest dimension and use the vectorized calculation on the larger dimension(s). For example if you have a 10000 by 10 matrix, try to set up your problem so you can loop over the 10 columns rather than the 10000 rows.\nIn general, in Python looping over rows is likely to be faster than looping over columns because of numpy’s row-major ordering (by default, matrices are stored in memory as a long array in which values in a row are adjacent to each other). However how numpy handles this is more complicated (see more in the Section on cache-aware programming), such that it may not matter for numpy calculations.\nYou can use direct arithmetic operations to add/subtract/multiply/divide a vector by each column of a matrix, e.g. A*b does element-wise multiplication of each column of A by a vector b. If you need to operate by row, you can do it by transposing the matrix.\n\nCaution: relying on Python’s broadcasting rule in the context of vectorized operations, such as is done when direct-multiplying a matrix by a vector to scale the columns relative to each other, can be dangerous as the code may not be easy for someone to read and poses greater dangers of bugs. In some cases you may want to first write the code more directly and then compare the more efficient code to make sure the results are the same. It’s also a good idea to comment your code in such cases.\n\n\nAre map operations faster than loops?\nThe potential for inefficiency of looping and map operations in interpreted languages will depend in part on whether a substantial part of the work is in the overhead involved in the looping or in the time required by the function evaluation on each of the elements. If you’re worried about speed, it’s a good idea to benchmark map or pandas.apply against looping.\nHere’s an example where the bulk of time is in the actual computation and not in the looping itself. 
Here map is not faster than a loop.\n\nimport time\nimport statsmodels.api as sm\n\nn = 500000;\nnr = 10000\nnCalcs = int(n/nr)\n\nmat = np.random.normal(size = (nr, nCalcs))\n\nX = list(range(nr))\nX = sm.add_constant(X)\n\ndef regrFun(i):\n model = sm.OLS(mat[:,i], X)\n return(model.fit().params[1])\n\nt0 = time.time()\nout1 = list(map(regrFun, range(nCalcs)))\ntime.time() - t0\n\n0.06300592422485352\n\nt0 = time.time()\nout2 = np.zeros(nCalcs)\nfor i in range(nCalcs):\n out2[i] = regrFun(i)\n\n\ntime.time() - t0\n\n0.06177210807800293\n\n\nAnd here’s an example, where (unlike the previous example) the core computation is very fast, so we might expect the overhead of looping (in its various forms seen here) to be important.\n\nimport time\nn = 10**6\nx = np.random.normal(size = n)\n\nt0 = time.time()\nout = np.exp(x)\ntime.time() - t0\n\n0.013067245483398438\n\nt0 = time.time()\nvals = np.zeros(n)\nfor i in range(n):\n vals[i] = np.exp(x[i])\n\n\ntime.time() - t0\n\n0.8266475200653076\n\nt0 = time.time()\nvals = [np.exp(v) for v in x]\ntime.time() - t0\n\n0.6520075798034668\n\nt0 = time.time()\nvals = list(map(np.exp, x))\ntime.time() - t0\n\n0.5861358642578125\n\n\nIn fact, it looks like we can’t avoid the overhead unless we use the vectorized call, which is of course the recommended approach in this case, both for speed and readability (and conciseness).\n\n\nMatrix algebra efficiency\nOften calculations that are not explicitly linear algebra calculations can be done as matrix algebra. If our Python installation has a fast (and possibly parallelized) BLAS, this allows our calculation to take advantage of it.\nFor example, we can sum the rows of a matrix by multiplying by a vector of ones.\n\nmat = np.random.normal(size=(500,500))\n\ntimeit.timeit('mat.dot(np.ones(500))', setup = 'import numpy as np', number = 1000, globals = {'mat': mat})\n\n0.03959464281797409\n\ntimeit.timeit('np.sum(mat, axis = 1)', setup = 'import numpy as np', number = 1000, globals = {'mat': mat})\n\n0.12120177410542965\n\n\nGiven the extra computation involved in actually multiplying each number by one, it’s surprising that this is faster than numpy sum function. One thing we’d want to know is whether the BLAS matrix multiplication call is being done in parallel.\nOn the other hand, big matrix operations can be slow.\nChallenge: Suppose you want a new matrix that computes the differences between successive columns of a matrix of arbitrary size. How would you do this as matrix algebra operations? It’s possible to write it as multiplying the matrix by another matrix that contains 0s, 1s, and -1s in appropriate places. Here it turns out that the for loop is much faster than matrix multiplication. However, there is a way to do it faster as matrix direct subtraction.\n\n\nOrder of operations and efficiency\nWhen doing matrix algebra, the order in which you do operations can be critical for efficiency. How should I order the following calculation?\n\nn = 5000\nA = np.random.normal(size=(n, n))\nB = np.random.normal(size=(n, n))\nx = np.random.normal(size=n)\n\nt0 = time.time()\nres1 = (A @ B) @ x\nprint(time.time() - t0)\n\n1.7937490940093994\n\nt0 = time.time()\nres1 = A @ (B @ x)\nprint(time.time() - t0)\n\n0.08377599716186523\n\n\nWhy is the second order much faster?\n\n\nAvoiding unnecessary operations\nWe can use the matrix direct product (i.e., A*B) to do some manipulations much more quickly than using matrix multiplication. 
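To get a feel for the difference in cost, here is a rough comparison one could run (timings will vary by machine): the elementwise (direct) product does on the order of n^2 multiplications, while the matrix product does on the order of n^3 work.

import numpy as np, timeit

A = np.random.normal(size=(1000, 1000))
B = np.random.normal(size=(1000, 1000))

timeit.timeit('A * B', number=10, globals={'A': A, 'B': B})  # elementwise (direct) product
timeit.timeit('A @ B', number=10, globals={'A': A, 'B': B})  # matrix multiplication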
Challenge: How can I use the direct product to find the trace of a matrix, \\(XY\\)?\nFinally, when working with diagonal matrices, you can generally get much faster results by being smart. The following operations: \\(X+D\\), \\(DX\\), \\(XD\\) are mathematically the sum of two matrices and products of two matrices. But we can do the computation without using two full matrices. Challenge: How?\n\nn = 1000\nX = np.random.normal(size=(n, n))\ndiagvals = np.random.normal(size=n)\nD = np.diag(diagvals)\n\n# The following lines are very inefficient\nsummedMat = X + D\nprodMat1 = D @ X\nprodMat2 = X @ D\n\nMore generally, sparse matrices and structured matrices (such as block diagonal matrices) can generally be worked with MUCH more efficiently than treating them as arbitrary matrices. The scipy.sparse package (for both structured and arbitrary sparse matrices) can help, as can specialized code available in other languages, such as C and Fortran packages.\n\n\nSpeed of lookup operations\nThere are lots of situations in which we need to retrieve values for subsequent computations. In some cases we might be retrieving elements of an array or looking up values in a dictionary.\nLet’s compare the speed of some different approaches to lookup.\n\nn = 1000\nx = list(np.random.normal(size = n))\nkeys = [str(v) for v in range(n)]\nxD = dict(zip(keys, x))\n\ntimeit.timeit(\"x[500]\", number = 10**6, globals = {'x':x})\n\n0.016299044713377953\n\ntimeit.timeit(\"xD['500']\", number=10**6, globals = {'xD':xD})\n\n0.030375661328434944\n\n\nHow is it that Python can look up by key in the dictionary at essentially the same speed as jumping to an index position? It uses hashing, which allows O(1) lookup. In contrast, if one has to look through each key in turn, that is O(n), which is much slower:\n\ntimeit.timeit(\"x[keys.index('500')]\", number = 10**6, globals = {'x':x, 'keys':keys})\n\n6.1827212162315845\n\n\nAs a further point of contrast, if we look up elements by name in R in named vectors or lists, that is much slower than looking up by index, because R doesn’t use hashing in that context and has to scan through the objects one by one until it finds the one with the name it is looking for. This stands in contrast to R and Python being able to directly go to the position of interest based on the index of an array, or to the hash-based lookup in a Python dictionary or an R environment." }, { "objectID": "units/unit5-programming.html#additional-general-strategies-for-efficiency", "href": "units/unit5-programming.html#additional-general-strategies-for-efficiency", "title": "Programming concepts", "section": "Additional general strategies for efficiency", - "text": "Additional general strategies for efficiency\nIt’s also useful to be aware of some other strategies for improving efficiency.\n\nCache-aware programming\nIn addition to main memory (what we usually mean when we talk about RAM), computers also have memory caches, which are small amounts of fast memory that can be accessed very quickly by the processor. For example your computer might have L1, L2, and L3 caches, with L1 the smallest and fastest and L3 the largest and slowest. The idea is to try to have the data that is most used by the processor in the cache.\nIf the next piece of data needed for computation is available in the cache, this is a cache hit and the data can be accessed very quickly. However, if the data is not available in the cache, this is a cache miss and the speed of access will be a lot slower. 
Cache-aware programming involves writing your code to minimize cache misses. Generally when data is read from memory it will be read in chunks, so values that are contiguous will be read together.\nHow does this inform one’s programming? For example, if you have a matrix of values stored in row-major order, computing on a row will be a lot faster than computing on a column, because the row can be read into the cache from main memory and then accessed in the cache. In contrast, if the matrix is large and therefore won’t fit in the cache, when you access the values of a column, you’ll have to go to main memory repeatedly to get the values for the row because the values are not stored contiguously.\nThere’s a nice example of the importance of the cache at the bottom of this blog post.\nIf you know the size of the cache, you can try to design your code so that in a given part of your code you access data structures that will fit in the cache. This sort of thing is generally more relevant if you’re coding in a language like C. But it can matter sometimes in interpreted languages too.\nLet’s see what happens in Python.\n\nnr = 800000\nnc = 100\n\nA = np.random.normal(size=(nr, nc))\ntA = np.random.normal(size=(nc, nr)) \n\n## Verify that A is row-major using `.flags` (notice the `C_CONTIGUOUS: True`).\nA.flags\n\n C_CONTIGUOUS : True\n F_CONTIGUOUS : False\n OWNDATA : True\n WRITEABLE : True\n ALIGNED : True\n WRITEBACKIFCOPY : False\n\n\nNote that I didn’t use A.T or np.transpose as that doesn’t make a copy in memory and so the transposed matrix doesn’t end up being row-major. You can use A.flags and A.T.flags` to see this.\nNow let’s time things.\n\n# Define the mean calculations as functions\ndef mean_by_column():\n return np.mean(A, axis=0)\n\ndef mean_by_row():\n return np.mean(tA, axis=1)\n\ntimeit.timeit(mean_by_column, number=10)\n\n0.5135188214480877\n\ntimeit.timeit(mean_by_row, number=10)\n\n0.4520120248198509\n\n\nSuppose we instead do the looping manually.\n\ntimeit.timeit('[np.mean(A[:,col]) for col in range(A.shape[1])]',\n setup = 'import numpy as np', number=10, globals = {'A': A}) # 5.10 sec.\n\n5.4370351154357195\n\ntimeit.timeit('[np.mean(tA[row,:]) for row in range(tA.shape[0])]',\n setup = 'import numpy as np', number=10, globals = {'tA': tA}) # 5.06 sec.\n\n0.4240651857107878\n\n\nIndeed, the row-wise calculations are faster, particularly if we do it manually. So that suggests numpy might be doing something clever in its implementation of mean with the axis argument.\n\nChallenge: suppose you were writing code for this kind of use case. How could you set up your calculations to do either row-wise or column-wise operations in a way that processes each number sequentially based on the order in which the numbers are stored. For example suppose the values are stored row-major but you want the column sums.\n\nWhen we define a numpy array, we can choose to use column-major order (i.e., “Fortran” order) with the order argument.\n\n\nLoop fusion\nLet’s consider this (vectorized) code:\n\nx = np.exp(x) + 3*np.sin(x)\n\nThis code has some downsides.\n\nThink about whether any additional memory has to be allocated.\nThink about how many for loops will have to get executed.\n\nContrast that to running directly as a for loop (e.g., here in Julia or in C/C++):\n\nfor i in 1:length(x)\n x[i] = exp(x[i]) + 3*sin(x[i])\nend\n\nHow does that affect the downsides mentioned above?\nCombining loops is called ‘fusing’ and is an important optimization that Julia can do, as shown in this demo. 
It’s also a key optimization done by XLA, a compiler used with Tensorflow, so one approach to getting loop fusion in Python is to use Tensorflow for such calculations within Python rather than simply using numpy.\n\n\nHashing (including name lookup)\nAbove I mentioned that Python uses hashing to store and lookup values by key in a dictionary. I’ll briefly describe what hashing is here, because it is a commonly-used strategy in programming in general.\nA hash function is a function that takes as input some data and maps it to a fixed-length output that can be used as a shortened reference to the data. (The function should be deterministic, always returing the same output for a given input.) We’ve seen this in the context of git commits where each commit was labeled with a long base-16 number. This also comes up when verifying files on the Internet. You can compute the hash value on the file you get and check that it is the same as the hash value associated with the legitimate copy of the file.\nWhile there are various uses of hashing, for our purposes here, hashing can allow one to look up values by their name via a hash table. The idea is that you have a set of key-value pairs (sometimes called a dictionary) where the key is the name associated with the value and the value is some arbitrary object. You want to be able to quickly find the value/object quickly.\nHashing allows one to quickly determine an index associated with the key and therefore quickly find the relevant value based on the index. For example, one approach is to compute the hash as a function of the key and then take the remainder when dividing by the number of possible results (here the fact that the result is a fixed-length output is important) to get the index. Here’s the procedure in pseudocode:\n hash = hashfunc(key) \n index = hash %% array_size \n ## %% is modulo operator - it gives the remainder\nIn general, there will be collisions – multiple keys will be assigned to the same index. However with a good hash function, usually there will be a small number of keys associated with a given bucket. So each bucket will contain a list of a small number of values and the associated keys. (The buckets might contain the actual values or they might contain the addresses of where the values are actually stored if the values are complicated objects.) Then determining the correct value (or the required address) within a given bucket is fast even with simple linear search through the items one by one. Put another way, the hash function distributes the keys amongst an array of buckets and allows one to look up the appropriate bucket quickly based on the computed index value. When the hash table is properly set up, the cost of looking up a value does not depend on the number of key-value pairs stored.\nPython uses hashing to look up the value based on the key in a given dictionary, and similarly when looking up variables in namespaces. This allows Python to retrieve objects very quickly.\n\n\nLazy evaluation\nWhat’s strange about this R code?\n\nf <- function(x) print(\"hi\")\nsystem.time(mean(rnorm(1000000)))\n\n user system elapsed \n 0.057 0.000 0.057 \n\nsystem.time(f(3))\n\n[1] \"hi\"\n\n\n user system elapsed \n 0 0 0 \n\nsystem.time(f(mean(rnorm(1000000)))) \n\n[1] \"hi\"\n\n\n user system elapsed \n 0.001 0.000 0.001 \n\n\nLazy evaluation is not just an R thing. It also occurs in Tensorflow (particularly version 1), the Python Dask package, and in Spark. 
The basic idea is to delay executation until it’s really needed, with the goal that if one does so, the system may be able to better optimize a series of multiple steps as a joint operation relative to executing them one by one.\nHowever, Python itself does not have lazy evaluation." + "text": "Additional general strategies for efficiency\nIt’s also useful to be aware of some other strategies for improving efficiency.\n\nCache-aware programming\nIn addition to main memory (what we usually mean when we talk about RAM), computers also have memory caches, which are small amounts of fast memory that can be accessed very quickly by the processor. For example your computer might have L1, L2, and L3 caches, with L1 the smallest and fastest and L3 the largest and slowest. The idea is to try to have the data that is most used by the processor in the cache.\nIf the next piece of data needed for computation is available in the cache, this is a cache hit and the data can be accessed very quickly. However, if the data is not available in the cache, this is a cache miss and the speed of access will be a lot slower. Cache-aware programming involves writing your code to minimize cache misses. Generally when data is read from memory it will be read in chunks, so values that are contiguous will be read together.\nHow does this inform one’s programming? For example, if you have a matrix of values stored in row-major order, computing on a row will be a lot faster than computing on a column, because the row can be read into the cache from main memory and then accessed in the cache. In contrast, if the matrix is large and therefore won’t fit in the cache, when you access the values of a column, you’ll have to go to main memory repeatedly to get the values for the row because the values are not stored contiguously.\nThere’s a nice example of the importance of the cache at the bottom of this blog post.\nIf you know the size of the cache, you can try to design your code so that in a given part of your code you access data structures that will fit in the cache. This sort of thing is generally more relevant if you’re coding in a language like C. But it can matter sometimes in interpreted languages too.\nLet’s see what happens in Python.\n\nnr = 800000\nnc = 100\n\nA = np.random.normal(size=(nr, nc))\ntA = np.random.normal(size=(nc, nr)) \n\n## Verify that A is row-major using `.flags` (notice the `C_CONTIGUOUS: True`).\nA.flags\n\n C_CONTIGUOUS : True\n F_CONTIGUOUS : False\n OWNDATA : True\n WRITEABLE : True\n ALIGNED : True\n WRITEBACKIFCOPY : False\n\n\nNote that I didn’t use A.T or np.transpose as that doesn’t make a copy in memory and so the transposed matrix doesn’t end up being row-major. You can use A.flags and A.T.flags` to see this.\nNow let’s time things.\n\n# Define the mean calculations as functions\ndef mean_by_column():\n return np.mean(A, axis=0)\n\ndef mean_by_row():\n return np.mean(tA, axis=1)\n\ntimeit.timeit(mean_by_column, number=10)\n\n0.5146233681589365\n\ntimeit.timeit(mean_by_row, number=10)\n\n0.4034226518124342\n\n\nSuppose we instead do the looping manually.\n\ntimeit.timeit('[np.mean(A[:,col]) for col in range(A.shape[1])]',\n setup = 'import numpy as np', number=10, globals = {'A': A}) # 5.10 sec.\n\n5.463273551315069\n\ntimeit.timeit('[np.mean(tA[row,:]) for row in range(tA.shape[0])]',\n setup = 'import numpy as np', number=10, globals = {'tA': tA}) # 5.06 sec.\n\n0.42474850825965405\n\n\nIndeed, the row-wise calculations are faster, particularly if we do it manually. 
So that suggests numpy might be doing something clever in its implementation of mean with the axis argument.\n\nChallenge: suppose you were writing code for this kind of use case. How could you set up your calculations to do either row-wise or column-wise operations in a way that processes each number sequentially based on the order in which the numbers are stored. For example suppose the values are stored row-major but you want the column sums.\n\nWhen we define a numpy array, we can choose to use column-major order (i.e., “Fortran” order) with the order argument.\n\n\nLoop fusion\nLet’s consider this (vectorized) code:\n\nx = np.exp(x) + 3*np.sin(x)\n\nThis code has some downsides.\n\nThink about whether any additional memory has to be allocated.\nThink about how many for loops will have to get executed.\n\nContrast that to running directly as a for loop (e.g., here in Julia or in C/C++):\n\nfor i in 1:length(x)\n x[i] = exp(x[i]) + 3*sin(x[i])\nend\n\nHow does that affect the downsides mentioned above?\nCombining loops is called ‘fusing’ and is an important optimization that Julia can do, as shown in this demo. It’s also a key optimization done by XLA, a compiler used with Tensorflow, so one approach to getting loop fusion in Python is to use Tensorflow for such calculations within Python rather than simply using numpy.\n\n\nHashing (including name lookup)\nAbove I mentioned that Python uses hashing to store and lookup values by key in a dictionary. I’ll briefly describe what hashing is here, because it is a commonly-used strategy in programming in general.\nA hash function is a function that takes as input some data and maps it to a fixed-length output that can be used as a shortened reference to the data. (The function should be deterministic, always returing the same output for a given input.) We’ve seen this in the context of git commits where each commit was labeled with a long base-16 number. This also comes up when verifying files on the Internet. You can compute the hash value on the file you get and check that it is the same as the hash value associated with the legitimate copy of the file.\nWhile there are various uses of hashing, for our purposes here, hashing can allow one to look up values by their name via a hash table. The idea is that you have a set of key-value pairs (sometimes called a dictionary) where the key is the name associated with the value and the value is some arbitrary object. You want to be able to quickly find the value/object quickly.\nHashing allows one to quickly determine an index associated with the key and therefore quickly find the relevant value based on the index. For example, one approach is to compute the hash as a function of the key and then take the remainder when dividing by the number of possible results (here the fact that the result is a fixed-length output is important) to get the index. Here’s the procedure in pseudocode:\n hash = hashfunc(key) \n index = hash %% array_size \n ## %% is modulo operator - it gives the remainder\nIn general, there will be collisions – multiple keys will be assigned to the same index. However with a good hash function, usually there will be a small number of keys associated with a given bucket. So each bucket will contain a list of a small number of values and the associated keys. (The buckets might contain the actual values or they might contain the addresses of where the values are actually stored if the values are complicated objects.) 
Then determining the correct value (or the required address) within a given bucket is fast even with simple linear search through the items one by one. Put another way, the hash function distributes the keys amongst an array of buckets and allows one to look up the appropriate bucket quickly based on the computed index value. When the hash table is properly set up, the cost of looking up a value does not depend on the number of key-value pairs stored.\nPython uses hashing to look up the value based on the key in a given dictionary, and similarly when looking up variables in namespaces. This allows Python to retrieve objects very quickly.\n\n\nLazy evaluation\nWhat’s strange about this R code?\n\nf <- function(x) print(\"hi\")\nsystem.time(mean(rnorm(1000000)))\n\n user system elapsed \n 0.06 0.00 0.06 \n\nsystem.time(f(3))\n\n[1] \"hi\"\n\n\n user system elapsed \n 0 0 0 \n\nsystem.time(f(mean(rnorm(1000000)))) \n\n[1] \"hi\"\n\n\n user system elapsed \n 0.001 0.000 0.001 \n\n\nLazy evaluation is not just an R thing. It also occurs in Tensorflow (particularly version 1), the Python Dask package, and in Spark. The basic idea is to delay executation until it’s really needed, with the goal that if one does so, the system may be able to better optimize a series of multiple steps as a joint operation relative to executing them one by one.\nHowever, Python itself does not have lazy evaluation." }, { "objectID": "units/unit2-dataTech.html", diff --git a/sitemap.xml b/sitemap.xml index 0e029ad..a7632c0 100644 --- a/sitemap.xml +++ b/sitemap.xml @@ -2,130 +2,130 @@ https://github.com/berkeley-stat243/stat243-fall-2023/publish.html - 2023-09-20T22:49:13.791Z + 2023-09-22T19:18:27.576Z https://github.com/berkeley-stat243/stat243-fall-2023/ps/ps1.html - 2023-09-20T22:49:10.035Z + 2023-09-22T19:18:24.156Z https://github.com/berkeley-stat243/stat243-fall-2023/ps/regex.html - 2023-09-20T22:49:01.806Z + 2023-09-22T19:18:16.620Z https://github.com/berkeley-stat243/stat243-fall-2023/rubric.html - 2023-09-20T22:48:57.206Z + 2023-09-22T19:18:12.380Z https://github.com/berkeley-stat243/stat243-fall-2023/labs/lab3-debugging.html - 2023-09-20T22:48:51.482Z + 2023-09-22T19:18:07.136Z https://github.com/berkeley-stat243/stat243-fall-2023/labs/lab1-submission.html - 2023-09-20T22:48:43.898Z + 2023-09-22T19:18:00.248Z https://github.com/berkeley-stat243/stat243-fall-2023/units/unit1-intro.html - 2023-09-20T22:48:34.682Z + 2023-09-22T19:17:52.044Z https://github.com/berkeley-stat243/stat243-fall-2023/units/unit9-sim.html - 2023-09-20T22:48:15.166Z + 2023-09-22T19:17:34.252Z https://github.com/berkeley-stat243/stat243-fall-2023/units/unit6-parallel.html - 2023-09-20T22:47:58.001Z + 2023-09-22T19:17:18.516Z https://github.com/berkeley-stat243/stat243-fall-2023/units/unit5-programming.html - 2023-09-20T22:47:25.060Z + 2023-09-22T19:16:48.492Z https://github.com/berkeley-stat243/stat243-fall-2023/units/unit2-dataTech.html - 2023-09-20T22:46:50.967Z + 2023-09-22T19:16:16.112Z https://github.com/berkeley-stat243/stat243-fall-2023/syllabus.html - 2023-09-20T22:46:32.038Z + 2023-09-22T19:15:58.436Z https://github.com/berkeley-stat243/stat243-fall-2023/howtos/windowsAndLinux.html - 2023-09-20T22:46:30.658Z + 2023-09-22T19:15:57.108Z https://github.com/berkeley-stat243/stat243-fall-2023/howtos/quartoInstall.html - 2023-09-20T22:46:29.498Z + 2023-09-22T19:15:55.956Z https://github.com/berkeley-stat243/stat243-fall-2023/howtos/gitInstall.html - 2023-09-20T22:46:28.594Z + 2023-09-22T19:15:55.076Z 
https://github.com/berkeley-stat243/stat243-fall-2023/howtos/accessingUnixCommandLine.html - 2023-09-20T22:46:27.642Z + 2023-09-22T19:15:54.164Z https://github.com/berkeley-stat243/stat243-fall-2023/index.html - 2023-09-20T22:46:26.774Z + 2023-09-22T19:15:53.256Z https://github.com/berkeley-stat243/stat243-fall-2023/howtos/ps-submission.html - 2023-09-20T22:46:28.190Z + 2023-09-22T19:15:54.680Z https://github.com/berkeley-stat243/stat243-fall-2023/howtos/windowsInstall.html - 2023-09-20T22:46:29.138Z + 2023-09-22T19:15:55.616Z https://github.com/berkeley-stat243/stat243-fall-2023/howtos/RandRStudioInstall.html - 2023-09-20T22:46:29.906Z + 2023-09-22T19:15:56.364Z https://github.com/berkeley-stat243/stat243-fall-2023/howtos/accessingPython.html - 2023-09-20T22:46:31.066Z + 2023-09-22T19:15:57.488Z https://github.com/berkeley-stat243/stat243-fall-2023/units/unit10-linalg.html - 2023-09-20T22:46:39.531Z + 2023-09-22T19:16:05.524Z https://github.com/berkeley-stat243/stat243-fall-2023/units/unit7-bigData.html - 2023-09-20T22:47:02.528Z + 2023-09-22T19:16:27.000Z https://github.com/berkeley-stat243/stat243-fall-2023/units/unit4-goodPractices.html - 2023-09-20T22:47:47.909Z + 2023-09-22T19:17:09.284Z https://github.com/berkeley-stat243/stat243-fall-2023/units/unit3-bash.html - 2023-09-20T22:48:06.469Z + 2023-09-22T19:17:26.328Z https://github.com/berkeley-stat243/stat243-fall-2023/units/unit8-numbers.html - 2023-09-20T22:48:24.050Z + 2023-09-22T19:17:42.488Z https://github.com/berkeley-stat243/stat243-fall-2023/labs/lab2-testing.html - 2023-09-20T22:48:39.986Z + 2023-09-22T19:17:56.748Z https://github.com/berkeley-stat243/stat243-fall-2023/labs/lab0-setup.html - 2023-09-20T22:48:47.622Z + 2023-09-22T19:18:03.604Z https://github.com/berkeley-stat243/stat243-fall-2023/schedule.html - 2023-09-20T22:48:56.750Z + 2023-09-22T19:18:11.920Z https://github.com/berkeley-stat243/stat243-fall-2023/ps/ps2.html - 2023-09-20T22:48:58.178Z + 2023-09-22T19:18:13.344Z https://github.com/berkeley-stat243/stat243-fall-2023/ps/ps3.html - 2023-09-20T22:49:05.798Z + 2023-09-22T19:18:20.288Z https://github.com/berkeley-stat243/stat243-fall-2023/office_hours.html - 2023-09-20T22:49:13.407Z + 2023-09-22T19:18:27.180Z diff --git a/units/unit1-intro.pdf b/units/unit1-intro.pdf index f60b021..9e94c98 100644 Binary files a/units/unit1-intro.pdf and b/units/unit1-intro.pdf differ diff --git a/units/unit10-linalg.pdf b/units/unit10-linalg.pdf index e2393cc..7a0936e 100644 Binary files a/units/unit10-linalg.pdf and b/units/unit10-linalg.pdf differ diff --git a/units/unit2-dataTech.pdf b/units/unit2-dataTech.pdf index 4d43802..67b83e5 100644 Binary files a/units/unit2-dataTech.pdf and b/units/unit2-dataTech.pdf differ diff --git a/units/unit3-bash.pdf b/units/unit3-bash.pdf index 7f2fed3..9c7487c 100644 Binary files a/units/unit3-bash.pdf and b/units/unit3-bash.pdf differ diff --git a/units/unit4-goodPractices.pdf b/units/unit4-goodPractices.pdf index f4d972c..2d447bf 100644 Binary files a/units/unit4-goodPractices.pdf and b/units/unit4-goodPractices.pdf differ diff --git a/units/unit5-programming.html b/units/unit5-programming.html index 737da42..5cba982 100644 --- a/units/unit5-programming.html +++ b/units/unit5-programming.html @@ -436,8 +436,8 @@

On this page

  • Lexical scoping (enclosing scopes)
  • Global and non-local variables
  • Closures
  • -
  • Decorators
  • +
  • Decorators
  • 8. Memory and copies -

    Vectors in C are really pointers to a block of memory:

    +

    Arrays in C are really pointers to a block of memory:

    int x[10];
    @@ -2413,7 +2413,7 @@

    Namespaces and scope
  • Global (aka ‘module’) scope: objects available in the module in which the function is defined (which may simply be the default global scope when you start the Python interpreter). This is also the local scope if the code is not executing inside a function.
  • Built-ins scope: objects provided by Python through the built-ins module but available from anywhere.
  • -

    Note that import adds the name of the imported module in the local scope.

    +

    Note that import adds the name of the imported module to the namespace of the current (i.e., local) scope.

    We can see the local and global namespaces using locals() and globals().

    cat local.py
    @@ -2513,16 +2513,18 @@

    Lexical s x = 7 f2() -f() # what will happen? - -## Case 4 -x = 3 -def f(): - def f2(): - print(x) - f2() - -f() # what will happen?

    +x = 100 +f() # what will happen? + +## Case 4 +x = 3 +def f(): + def f2(): + print(x) + f2() + +x = 100 +f() # what will happen?

    Here’s a tricky example:

    @@ -2536,56 +2538,61 @@

    Lexical s ## fun_constructor() creates functions myfun = fun_constructor() myfun(3)

    -
    -
    13
    -

    Let’s work through this:

    1. What is the enclosing scope for the function g()?
    2. -
    3. What does g() use for y?
    4. -
    5. Where is myfun defined (this is tricky)?
    6. -
    7. What is the enclosing (scope) for myfun()?
    8. +
    9. Which y does g() use?
    10. +
    11. Where is myfun defined (this is tricky – how does myfun relate to g)?
    12. +
    13. What is the enclosing scope for myfun()?
    14. When fun_constructor() finishes, does its namespace disappear? What would happen if it did?
    15. What does myfun use for y?
    -

    [TODO: is there a way to access variables in a closure (i.e., in this case access ‘y’ based on myfun). Is there a way to find the enclosing scope for ‘myfun’ and look in it? See the inspect package]

    +

    We can use the inspect package to see information about the closure.

    +
    +
    import inspect
    +inspect.getclosurevars(myfun)
    +
    +
    ClosureVars(nonlocals={}, globals={'copy': <module 'copy' from '/system/linux/mambaforge-3.11/lib/python3.11/copy.py'>}, builtins={}, unbound=set())
    +
    +
    +

    (Note that I haven’t fully investigated the use of inspect, but it looks like it has a lot of useful tools.)
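For example, here’s a minimal, self-contained sketch of pulling a captured value out of a closure (make_adder and add3 are hypothetical names used just for illustration, not objects from these notes):

import inspect

def make_adder(k):
    def add(x):
        return x + k
    return add

add3 = make_adder(3)
inspect.getclosurevars(add3).nonlocals    # {'k': 3}: the variables captured from the enclosing scope
add3.__closure__[0].cell_contents         # 3: direct access to the cell object holding k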

    Be careful when using variables from non-local scopes as the value of that variable may well not be what you expect it to be. In general one wants to think carefully before using variables that are taken from outside the local scope, but in some cases it can be useful.

    -

    Next we’ll see some ways of accessing variables outside of the local scope and of enclosing scopes.

    +

    Next we’ll see some ways of accessing variables outside of the local scope.

    Global and non-local variables

    We can create and modify global variables and variables in the enclosing scope using global and nonlocal respectively. Note that global is in the context of the current module so this could be a variable in your current Python session if you’re working with functions defined in that session or a global variable in a module or package.

    -
    del x
    -
    -def myfun():
    -    global x
    -    x = 7
    -
    -myfun()
    -print(x)
    +
    del x
    +
    +def myfun():
    +    global x
    +    x = 7
    +
    +myfun()
    +print(x)
    7
    -
    x = 9
    -myfun()
    -print(x)
    +
    x = 9
    +myfun()
    +print(x)
    7
    -
    def outer_function():
    -    x = 10  # Outer variable
    -    def inner_function():
    -        nonlocal x
    -        x = 20  # Modifying the outer variable
    -    print(x)  # Output: 10
    -    inner_function()
    -    print(x)  # Output: 20
    -
    -outer_function()
    +
    def outer_function():
    +    x = 10  # Outer variable
    +    def inner_function():
    +        nonlocal x
    +        x = 20  # Modifying the outer variable
    +    print(x)  # Output: 10
    +    inner_function()
    +    print(x)  # Output: 20
    +
    +outer_function()
    10
     20
    @@ -2596,22 +2603,22 @@

    Global and

    Closures

    One way to associate data with functions is to use a closure. This is a functional programming way to achieve something like an OOP class. This Wikipedia entry nicely summarizes the idea, which is a general functional programming idea and not specific to Python.

    -

    Using a closure involves creating one (or more functions) within a function call and returning the function(s) as the output. When one executes the original function, the new function(s) is created and returned and one can then call that new function(s). The new function then can access objects in the enclosing scope (the scope of the original function) and can use nonlocal to assign into the enclosing scope, to which the function (or the multiple functions) have access. The nice thing about this compared to using a global variable is that the data in the closure is bound up with the function(s) and is protected from being changed by the user.

    -
    -
    x = np.random.normal(size = 5)
    -def scaler_constructor(input):
    -    data = input
    -    def g(param):
    -            return(param * data) 
    -    return(g)
    -
    -scaler = scaler_constructor(x)
    -del x # to demonstrate we no longer need x
    -scaler(3)
    +

Using a closure involves creating one (or more) functions within a function call and returning the function(s) as the output. When one executes the original function (the constructor), the new function(s) are created and returned, and one can then call them. Those functions can access objects in the enclosing scope (the scope of the constructor) and can use nonlocal to assign into the enclosing scope, to which the function (or the multiple functions) have access. The nice thing about this compared to using a global variable is that the data in the closure is bound up with the function(s) and is protected from being changed by the user.

    +
    +
    x = np.random.normal(size = 5)
    +def scaler_constructor(input):
    +    data = input
    +    def g(param):
    +            return(param * data) 
    +    return(g)
    +
    +scaler = scaler_constructor(x)
    +del x # to demonstrate we no longer need x
    +scaler(3)
    array([ 0.5081473 ,  2.22166935, -2.86110181, -0.79865552,  0.09784364])
    -
    scaler(6)
    +
    scaler(6)
    array([ 1.0162946 ,  4.44333871, -5.72220361, -1.59731104,  0.19568728])
    @@ -2620,65 +2627,66 @@

    Closures

    Note that it can be hard to see the memory use involved in the closure.

    Here’s a more realistic example. There are other ways you could do this, but this is slick:

    -
    def make_container(n):
    -    x = np.zeros(n)
    -    i = 0
    -    def store(value = None):
    -        nonlocal x, i
    -        if value is None:
    -            return x
    -        else:
    -            x[i] = value
    -            i += 1
    -    return store
    -
    -
    -nboot = 20
    -bootmeans = make_container(nboot)
    -
    -import pandas as pd
    -iris = pd.read_csv('https://raw.githubusercontent.com/pandas-dev/pandas/master/pandas/tests/io/data/csv/iris.csv')
    -data = iris['SepalLength']
    -
    -for i in range(nboot): 
    -    bootmeans(np.mean(np.random.choice(data, size = len(data), replace = True)))
    -
    -
    -bootmeans()
    +
    def make_container(n):
    +    x = np.zeros(n)
    +    i = 0
    +    def store(value = None):
    +        nonlocal x, i
    +        if value is None:
    +            return x
    +        else:
    +            x[i] = value
    +            i += 1
    +    return store
    +
    +
    +nboot = 20
    +bootmeans = make_container(nboot)
    +
    +import pandas as pd
    +iris = pd.read_csv('https://raw.githubusercontent.com/pandas-dev/pandas/master/pandas/tests/io/data/csv/iris.csv')
    +data = iris['SepalLength']
    +
    +for i in range(nboot): 
    +    bootmeans(np.mean(np.random.choice(data, size = len(data), replace = True)))
    +
    +
    +bootmeans()
    array([5.874     , 5.95533333, 5.86      , 5.754     , 5.77466667,
            5.81733333, 5.902     , 5.79933333, 5.842     , 5.81933333,
            5.854     , 5.97      , 5.84      , 5.80133333, 5.99333333,
            5.88133333, 5.83133333, 5.84533333, 5.91466667, 5.79666667])
    -
    bootmeans.__closure__
    +
    bootmeans.__closure__
    -
    (<cell at 0x7f9cc6b362c0: int object at 0x7f9cd7aef968>, <cell at 0x7f9cc6b36680: numpy.ndarray object at 0x7f9cc6c76d30>)
    +
    (<cell at 0x7fbec758ded0: int object at 0x7fbedc5f3968>, <cell at 0x7fbec758e290: numpy.ndarray object at 0x7fbec76ced30>)
    -
    -

    Decorators

    +
    +
    +

    Decorators

    Now that we’ve seen function generators, it’s straightforward to discuss decorators.

A decorator is a wrapper around a function that extends the function’s behavior without actually modifying the function itself.

    We can create a simple decorator “manually” like this:

    -
    def verbosity_wrapper(myfun):
    -    def wrapper(*args, **kwargs):
    -        print(f"Starting {myfun.__name__}.")
    -        output = myfun(*args, **kwargs)
    -        print(f"Finishing {myfun.__name__}.")
    -        return output
    -    return wrapper
    -    
    -verbose_rnorm = verbosity_wrapper(np.random.normal)
    -
    -x = verbose_rnorm(size = 5)
    +
    def verbosity_wrapper(myfun):
    +    def wrapper(*args, **kwargs):
    +        print(f"Starting {myfun.__name__}.")
    +        output = myfun(*args, **kwargs)
    +        print(f"Finishing {myfun.__name__}.")
    +        return output
    +    return wrapper
    +    
    +verbose_rnorm = verbosity_wrapper(np.random.normal)
    +
    +x = verbose_rnorm(size = 5)
    Starting normal.
     Finishing normal.
    -
    x
    +
    x
    array([ 0.07202449, -0.67672674,  0.98248139, -1.65748762,  0.81795808])
    @@ -2686,16 +2694,16 @@

    Decorators

    Python provides syntax that helps you create decorators with less work (this is an example of the general idea of syntactic sugar).

    We can easily apply our decorator defined above to a function as follows. Now the function name refers to the wrapped version of the function.

    -
    @verbosity_wrapper
    -def myfun(x):
    -    return x
    -
    -y = myfun(7)
    +
    @verbosity_wrapper
    +def myfun(x):
    +    return x
    +
    +y = myfun(7)
    Starting myfun.
     Finishing myfun.
    -
    y
    +
    y
    7
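One side note (not from the original example): written as above, the wrapped function’s __name__ becomes 'wrapper', which can make debugging and profiling output confusing. The standard library’s functools.wraps decorator copies the original function’s metadata onto the wrapper; here’s a sketch (verbosity_wrapper2 and myfun2 are hypothetical names):

import functools

def verbosity_wrapper2(myfun):
    @functools.wraps(myfun)              # copy over the wrapped function's name, docstring, etc.
    def wrapper(*args, **kwargs):
        print(f"Starting {myfun.__name__}.")
        output = myfun(*args, **kwargs)
        print(f"Finishing {myfun.__name__}.")
        return output
    return wrapper

@verbosity_wrapper2
def myfun2(x):
    return x

myfun2.__name__    # 'myfun2' rather than 'wrapper'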
    @@ -2704,19 +2712,18 @@

    Decorators

    One real-world example of using decorators is in setting up functions to run in parallel in dask, which we’ll discuss in Unit 7.

    -

    8. Memory and copies

    Overview

    The main things to remember when thinking about memory use are: (1) numeric vectors take 8 bytes per element and (2) we need to keep track of when large objects are created, including local variables in the frames of functions.

    -
    x = np.random.normal(size = 5)
    -x.itemsize # 8 bytes
    +
    x = np.random.normal(size = 5)
    +x.itemsize # 8 bytes
    8
    -
    x.nbytes
    +
    x.nbytes
    40
    @@ -2761,47 +2768,47 @@

    Monitoring

    There are a number of ways to see how much memory is being used. When Python is actively executing statements, you can use top from the UNIX shell.

    In Python, we can call out to the system to get the info we want:

    -
    import psutil
    -
    -# Get memory information
    -memory_info = psutil.Process().memory_info()
    -
    -# Print the memory usage
    -print("Memory usage:", memory_info.rss/10**6, " Mb.")
    -
    -# Let's turn that into a function for later use:
    +
    import psutil
    +
    +# Get memory information
    +memory_info = psutil.Process().memory_info()
    +
    +# Print the memory usage
    +print("Memory usage:", memory_info.rss/10**6, " Mb.")
    +
    +# Let's turn that into a function for later use:
    -
    Memory usage: 470.368256  Mb.
    +
    Memory usage: 472.645632  Mb.
    -
    def mem_used():
    -    print("Memory usage:", psutil.Process().memory_info().rss/10**6, " Mb.")
    +
    def mem_used():
    +    print("Memory usage:", psutil.Process().memory_info().rss/10**6, " Mb.")
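Another option from the standard library (a sketch, not part of the original notes; it assumes numpy is imported as np, as elsewhere in this unit) is tracemalloc, which tracks allocations made through Python’s memory allocator, including the peak usage. Recent numpy versions report their array buffers to it.

import tracemalloc

tracemalloc.start()
tmp = np.random.normal(size = 10**6)              # roughly 8 Mb
current, peak = tracemalloc.get_traced_memory()   # bytes currently allocated and the peak so far
print(f"Current: {current/10**6} Mb; peak: {peak/10**6} Mb.")
tracemalloc.stop()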

    We can see the size of an object (in bytes) with sys.getsizeof().

    -
    my_list = [1, 2, 3, 4, 5]
    -sys.getsizeof(my_list)
    +
    my_list = [1, 2, 3, 4, 5]
    +sys.getsizeof(my_list)
    104
    -
    x = np.random.normal(size = 10**7) # should use about 80 Mb
    -sys.getsizeof(x)
    +
    x = np.random.normal(size = 10**7) # should use about 80 Mb
    +sys.getsizeof(x)
    80000112

    However, we need to be careful about objects that refer to other objects:

    -
    y = [3, x]
    -sys.getsizeof(y)  # Whoops!
    +
    y = [3, x]
    +sys.getsizeof(y)  # Whoops!
    72

    Here’s a trick where we serialize the object, as if to export it, and then see how long the binary representation is.

    -
    import pickle
    -ser_object = pickle.dumps(y)
    -sys.getsizeof(ser_object)
    +
    import pickle
    +ser_object = pickle.dumps(y)
    +sys.getsizeof(ser_object)
    80000201
    @@ -2815,34 +2822,34 @@

    How memory is

    Two key tools: id and is

We can use the id function to see where in memory an object is stored and is to see if two objects are actually the same object in memory. It’s particularly useful for understanding storage and memory use for complicated data structures. We’ll also see that they can be handy tools for seeing where copies are made and where they are not.

    -
    x = np.random.normal(size = 10**7)
    -id(x)
    +
    x = np.random.normal(size = 10**7)
    +id(x)
    -
    140311150938704
    +
    140457191377488
    -
    sys.getsizeof(x)
    +
    sys.getsizeof(x)
    80000112
    -
    y = x
    -id(y)
    +
    y = x
    +id(y)
    -
    140311150938704
    +
    140457191377488
    -
    x is y
    +
    x is y
    True
    -
    sys.getsizeof(y)
    +
    sys.getsizeof(y)
    80000112
    -
    z = x.copy()
    -id(z)
    +
    z = x.copy()
    +id(z)
    -
    140311153128912
    +
    140457193549392
    -
    sys.getsizeof(z)
    +
    sys.getsizeof(z)
    80000112
    @@ -2854,49 +2861,49 @@

    Memor

    How lists are stored

    Here we can use id to determine how the overall list is stored as well as the elements of the list.

    -
    nums = np.random.normal(size = 5)
    -obj = [nums, nums, np.random.normal(size = 5), ['adfs']]
    -
    -id(nums)
    +
    nums = np.random.normal(size = 5)
    +obj = [nums, nums, np.random.normal(size = 5), ['adfs']]
    +
    +id(nums)
    -
    140311150937936
    +
    140457191376720
    -
    id(obj)
    +
    id(obj)
    -
    140311325165888
    +
    140457364871680
    -
    id(obj[0])
    +
    id(obj[0])
    -
    140311150937936
    +
    140457191376720
    -
    id(obj[1])
    +
    id(obj[1])
    -
    140311150937936
    +
    140457191376720
    -
    id(obj[2])
    +
    id(obj[2])
    -
    140311150945712
    +
    140457191384496
    -
    id(obj[3])
    +
    id(obj[3])
    -
    140311151627584
    +
    140457192017152
    -
    id(obj[3][0])
    +
    id(obj[3][0])
    -
    140311151630320
    +
    140457192019632
    -
    obj[0] is obj[1]
    +
    obj[0] is obj[1]
    True
    -
    obj[0] is obj[2]
    +
    obj[0] is obj[2]
    False

    What do we notice?

      -
    • The list itself appears to be a array of pointers to the component elements.
    • +
• The list itself appears to be an array of references (pointers) to the component elements.
    • Each element has its own address.
    • Two elements of a list can use the same memory (see the first two elements here, whose contents are at the same memory address).
    • A list element can use the same memory as another object (or part of another object).
    • @@ -2910,15 +2917,15 @@

      How char

      Modifying elements in place

What does this simple experiment tell us?

      -
      x = np.random.normal(size = 5)
      -id(x)
      +
      x = np.random.normal(size = 5)
      +id(x)
      -
      140311150946768
      +
      140457191385552
      -
      x[2] = 3.5
      -id(x)
      +
      x[2] = 3.5
      +id(x)
      -
      140311150946768
      +
      140457191385552

      It makes some sense that modifying elements of an object here doesn’t cause a copy – if it did, working with large objects would be very difficult.

      @@ -2928,20 +2935,20 @@

      Modifying elem

      When are copies made?

      Let’s try to understand when Python uses additional memory for objects, and how it knows when it can delete memory. We’ll use large objects so that we can use free or top to see how memory use by the Python process changes.

      -
      x = np.random.normal(size = 10**8)
      -id(x)
      +
      x = np.random.normal(size = 10**8)
      +id(x)
      -
      140311151333360
      +
      140457191755760
      -
      y = x
      -id(y)
      +
      y = x
      +id(y)
      -
      140311151333360
      +
      140457191755760
      -
      x = np.random.normal(size = 10**8)
      -id(x)
      +
      x = np.random.normal(size = 10**8)
      +id(x)
      -
      140311150946768
      +
      140457191385552

      Only if we re-assign x to reference a different object does additional memory get used.

      @@ -2949,49 +2956,49 @@

      When are copies made?

      How does Python know when it can free up memory?

Python keeps track of how many names refer to an object and only frees the memory when there are no remaining references to the object.

      -
      import sys
      -
      -x = np.random.normal(size = 10**8)
      -y = x
      -sys.getrefcount(y)
      +
      import sys
      +
      +x = np.random.normal(size = 10**8)
      +y = x
      +sys.getrefcount(y)
      3
      -
      del x
      -sys.getrefcount(y)
      +
      del x
      +sys.getrefcount(y)
      2
      -
      del y
      +
      del y

      We can see the number of references using sys.getrefcount. Confusingly, the number is one higher than we’d expect, because it includes the temporary reference from passing the object as the argument to getrefcount.

      -
      x = np.random.normal(size = 5)
      -sys.getrefcount(x)  # 2 
      +
      x = np.random.normal(size = 5)
      +sys.getrefcount(x)  # 2 
      2
      -
      y = x
      -sys.getrefcount(x)  # 3
      +
      y = x
      +sys.getrefcount(x)  # 3
      3
      -
      sys.getrefcount(y)  # 3
      +
      sys.getrefcount(y)  # 3
      3
      -
      del y
      -sys.getrefcount(x)  # 2
      +
      del y
      +sys.getrefcount(x)  # 2
      2
      -
      y = x
      -x = np.random.normal(size = 5)
      -sys.getrefcount(y)  # 2
      +
      y = x
      +x = np.random.normal(size = 5)
      +sys.getrefcount(y)  # 2
      2
      -
      sys.getrefcount(x)  # 2
      +
      sys.getrefcount(x)  # 2
      2
      @@ -3002,7 +3009,7 @@

      Strategies for saving memory

      -

      A couple basic strategies for saving memory include:

      +

A few basic strategies for saving memory include:

• Avoiding unnecessary copies (see the sketch just after this list).
      • Removing objects that are not being used, at which point the Python garbage collector should free up the memory.
      • @@ -3011,33 +3018,35 @@

        Strategies fo
        • Using types that take up less memory (e.g., Bool, Int16, Float32) when possible.

          -
          x = np.array(np.random.normal(size = 5), dtype = "float32")
          -x.itemsize
          +
          x = np.array(np.random.normal(size = 5), dtype = "float32")
          +x.itemsize
          4
          -
          x = np.array([3,4,2,-2], dtype = "int16")
          -x.itemsize
          +
          x = np.array([3,4,2,-2], dtype = "int16")
          +x.itemsize
          2
        • +
        • Reading data in from files in chunks rather than reading the entire dataset (more in Unit 7).

        • +
        • Exploring packages such as arrow for efficiently using memory, as discussed in Unit 2.

    Example

    -

    Let’s work through a real example where we keep a running tally of current memory in use and maximum memory used in a function call. We’ll want to consider hidden uses of memory, when copies are made, and lazy evaluation. This code (translated from the original R code) is courtesy of Yuval Benjamini. For our purposes here, let’s assume that xvar and yvar are very long vectors using a lot of memory.

    +

    Let’s work through a real example where we keep a running tally of current memory in use and maximum memory used in a function call. We’ll want to consider hidden uses of memory, when copies are made, and lazy evaluation. This code (translated from the original R code) is courtesy of Yuval Benjamini. For our purposes here, let’s assume that xvar and yvar are very long numpy arrays using a lot of memory.

    -
    def fastcount(xvar, yvar):
    -    naline = np.isnan(xvar)
    -    naline[np.isnan(yvar)] = True
    -    localx = xvar
    -    localy = yvar
    -    localx[naline] = 0
    -    localy[naline] = 0
    -    useline = ~naline
    -    ## We'll ignore the rest of the code.
    -    ## ....
    +
    def fastcount(xvar, yvar):
    +    naline = np.isnan(xvar)
    +    naline[np.isnan(yvar)] = True
    +    localx = xvar.copy()
    +    localy = yvar.copy()
    +    localx[naline] = 0
    +    localy[naline] = 0
    +    useline = ~naline
    +    ## We'll ignore the rest of the code.
    +    ## ....
    @@ -3052,26 +3061,26 @@

    Why are

    We’ll focus on Python in the following discussion, but most of the concepts apply to other interpreted languages.

    For example, consider this code:

    -
    x = 3
    -abs(x)
    -x*7
    -abs(x)
    -x = 'hi'
    -try:
    -    abs(x)
    -except Exception as error:
    -    print(error)
    -x*3
    +
    x = 3
    +abs(x)
    +x*7
    +abs(x)
    +x = 'hi'
    +try:
    +    abs(x)
    +except Exception as error:
    +    print(error)
    +x*3

Because of dynamic typing, when the interpreter sees abs(x) it needs to check whether x is something on which the absolute value function can be used, including dealing with the fact that x could be a list or array with many numbers in it. In addition it needs to (using scoping rules) look up the value of x. (Consider that x might not even exist at the point that abs(x) is called.) Only then can the absolute value calculation happen. For the multiplication, Python needs to look up the version of * that can be used, depending on the type of x.

    Let’s consider writing a loop:

    -
    for i in range(10):
    -    if np.random.normal(size = 1) > 0:
    -        x = 'hi'
    -    if np.random.normal(size = 1) > 0.5:
    -        del x
    -    x[i] = np.exp(x[i])
    +
    for i in range(10):
    +    if np.random.normal(size = 1) > 0:
    +        x = 'hi'
    +    if np.random.normal(size = 1) > 0.5:
    +        del x
    +    x[i] = np.exp(x[i])

    There is no way around the fact that because of how dynamic this is, the interpreter needs to check if x exists, if it is a vector of sufficient length, if it contains numeric values, and it needs to go retrieve the required value, EVERY TIME the np.exp() is executed. Now the code above is unusual, and in most cases, we wouldn’t have the if statements that modify x. So you could imagine a process by which the checking were done on the first iteration and then not needed after that – that gets into the idea of just-in-time compilation, discussed later.

The standard Python interpreter (CPython) is a C program, so in some sense everything that happens is running as compiled code, but there are lots more things being done to accomplish a given task using interpreted code than if the task had been written directly in code that is compiled. By analogy, consider talking directly to a person in a language you both know compared to talking to a person via an interpreter who has to translate between two languages. Ultimately, the same information gets communicated (hopefully!) but the number of words spoken and time involved is much greater.

    @@ -3107,51 +3116,51 @@

    Byte compiling (op

    If you look at the file names in the directory of an installed Python package you may see files with the .pyc extension. These files have been byte-compiled.

We can byte compile our own functions using either the py_compile or compileall modules. Here’s an example (silly, since as experienced Python programmers we would use a vectorized calculation here rather than this unvectorized code).

    -
    import time
    -
    -def f(vals):
    -    x = np.zeros(len(vals))
    -    for i in range(len(vals)):
    -        x[i] = np.exp(vals[i])
    -    return(x)
    -
    -x = np.random.normal(size = 10**6)
    -t0 = time.time()
    -out = f(x)
    -time.time() - t0
    +
    import time
    +
    +def f(vals):
    +    x = np.zeros(len(vals))
    +    for i in range(len(vals)):
    +        x[i] = np.exp(vals[i])
    +    return(x)
    +
    +x = np.random.normal(size = 10**6)
    +t0 = time.time()
    +out = f(x)
    +time.time() - t0
    -
    0.7586061954498291
    +
    0.8189809322357178
    -
    t0 = time.time()
    -out = np.exp(x)
    -time.time() - t0
    +
    t0 = time.time()
    +out = np.exp(x)
    +time.time() - t0
    -
    0.01432943344116211
    +
    0.012688636779785156
    -
    import py_compile
    -py_compile.compile('vec.py')
    +
    import py_compile
    +py_compile.compile('vec.py')
    '__pycache__/vec.cpython-311.pyc'
    -
    cp __pycache__/vec.cpython-311.pyc vec.pyc
    -rm vec.py    # make sure non-compiled module not loaded
    +
    cp __pycache__/vec.cpython-311.pyc vec.pyc
    +rm vec.py    # make sure non-compiled module not loaded
    -
    import vec
    -vec.__file__
    -    
    +
    import vec
    +vec.__file__
    +    
    '/accounts/vis/paciorek/teaching/243fall23/stat243-fall-2023/units/vec.pyc'
    -
    t0 = time.time()
    -out = vec.f(x)
    -time.time() - t0
    +
    t0 = time.time()
    +out = vec.f(x)
    +time.time() - t0
    -
    0.9548311233520508
    +
    0.8231360912322998

Unfortunately, as seen above, byte compiling may not speed things up much. I’m not sure why.

    @@ -3165,40 +3174,41 @@

    Benchmarking an

    Timing your code

    There are a few ways to time code:

    -
    import time
    -t0 = time.time()
    -x = 3
    -t1 = time.time()
    -
    -print(f"Execution time: {t1-t0} seconds.")
    +
    import time
    +t0 = time.time()
    +x = 3
    +t1 = time.time()
    +
    +print(f"Execution time: {t1-t0} seconds.")
    -
    Execution time: 0.006555795669555664 seconds.
    +
    Execution time: 0.004503488540649414 seconds.

    In general, it’s a good idea to repeat (replicate) your timing, as there is some stochasticity in how fast your computer will run a piece of code at any given moment.

    Using time is fine for code that takes a little while to run, but for code that is really fast, it may not be very accurate. Measuring fast bits of code is tricky to do well. This next approach is better for benchmarking code (particularly faster bits of code).

    -
    import timeit
    -
    -timeit.timeit('x = np.exp(3.)', setup = 'import numpy as np', number = 100)
    +
    import timeit
    +
    +timeit.timeit('x = np.exp(3.)', setup = 'import numpy as np', number = 100)
    -
    7.695332169532776e-05
    +
    0.00011431612074375153
    -
    code = '''
    -x = np.exp(3.)
    -'''
    -
    -timeit.timeit(code, setup = 'import numpy as np', number = 100)
    +
    code = '''
    +x = np.exp(3.)
    +'''
    +
    +timeit.timeit(code, setup = 'import numpy as np', number = 100)
    -
    7.366575300693512e-05
    +
    7.406435906887054e-05

    That reports the total time for the 100 replications.

    We can run it from the command line.

    -
    python -m timeit -s 'import numpy' -n 1000 'x = numpy.exp(3.)'
    +
    python -m timeit -s 'import numpy' -n 1000 'x = numpy.exp(3.)'
    -
    1000 loops, best of 5: 549 nsec per loop
    +
    :0: UserWarning: The test results are likely unreliable. The worst time (2.67 usec) was more than four times slower than the best time (623 nsec).
    +1000 loops, best of 5: 623 nsec per loop

timeit ran the code 1000 times in each of 5 repetitions, reporting the average time per evaluation for the best of the 5 repetitions.
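The same best-of-several-repetitions pattern is available from within Python via timeit.repeat; here’s a sketch mirroring the command-line call above:

import timeit

times = timeit.repeat('x = np.exp(3.)', setup = 'import numpy as np',
                      repeat = 5, number = 1000)
min(times) / 1000    # per-evaluation time for the best of the 5 repetitions, in seconds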

    @@ -3208,35 +3218,35 @@

    Profiling

The cProfile module will show you how much time is spent in different functions, which can help you pinpoint bottlenecks in your code.

    I haven’t run this code when producing this document as the output of the profiling can be lengthy.

    -
    def lr_slow(y, x):
    -    xtx = x.T @ x
    -    xty = x.T @ y
    -    inv = np.linalg.inv(xtx)
    -    return inv @ xty
    -
    -## generate random observations and random matrix of predictors
    -y = np.random.normal(size = 5000)
    -x = np.random.normal(size = (5000,1000))
    -
    -import cProfile
    -
    -cProfile.run('lr_slow(y,x)')
    +
    def lr_slow(y, x):
    +    xtx = x.T @ x
    +    xty = x.T @ y
    +    inv = np.linalg.inv(xtx)
    +    return inv @ xty
    +
    +## generate random observations and random matrix of predictors
    +y = np.random.normal(size = 5000)
    +x = np.random.normal(size = (5000,1000))
    +
    +import cProfile
    +
    +cProfile.run('lr_slow(y,x)')

    The cumtime column includes the time spent in subfunction calls while the tottime column excludes it.
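To control how that lengthy output is sorted and truncated, one can pair cProfile with the standard pstats module. A sketch (assuming lr_slow, y, and x are defined as above):

import cProfile, pstats

pr = cProfile.Profile()
pr.enable()
lr_slow(y, x)
pr.disable()
pstats.Stats(pr).sort_stats('cumulative').print_stats(10)   # top ten entries by cumtime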

    As we’ll discuss in detail in Unit 10, we almost never want to explicitly invert a matrix. Instead we factorize the matrix and use the factorized result to do the computation of interest. In this case using the Cholesky decomposition is a standard approach, followed by solving triangular systems of equations.

    -
    import scipy as sp
    -
    -def lr_fast(y, x):
    -    xtx = x.T @ x
    -    xty = x.T @ y
    -    L = sp.linalg.cholesky(xtx)
    -    out = sp.linalg.solve_triangular(L.T, 
    -          sp.linalg.solve_triangular(L, xty, lower=True),
    -          lower=False)
    -    return(out)
    -
    -cProfile.run('lr_fast(y,x)')
    +
    import scipy as sp
    +
    +def lr_fast(y, x):
    +    xtx = x.T @ x
    +    xty = x.T @ y
    +    L = sp.linalg.cholesky(xtx)
    +    out = sp.linalg.solve_triangular(L.T, 
    +          sp.linalg.solve_triangular(L, xty, lower=True),
    +          lower=False)
    +    return(out)
    +
    +cProfile.run('lr_fast(y,x)')

    The Cholesky now dominates the computational time (but is much faster than inv), so there’s not much more we can do in this case.

    You might wonder if it’s better to use x.T or np.transpose(x). Try using timeit to decide.

    @@ -3257,88 +3267,88 @@

    Writing effi

    Pre-allocating memory

    Let’s consider whether we should pre-allocate space for the output of an operation or if it’s ok to keep extending the length of an array or list.

    -
    n = 100000
    -z = np.random.normal(size = n)
    -
    -## Pre-allocation
    -
    -def fun_prealloc(vals):
    -   n = len(vals)
    -   x = [0] * n
    -   for i in range(n):
    -       x[i] = np.exp(vals[i])
    -   return(x)
    -
    -## Appending to a list
    -
    -def fun_append(vals):
    -   x = []
    -   for i in range(n):
    -       x.append(np.exp(vals[i]))
    -   return(x)
    -
    -## Appending to a numpy array
    -
    -def fun_append_np(vals):
    -   x = np.array([])
    -   for i in range(n):
    -       x = np.append(x, np.exp(vals[i]))
    -   return(x)
    -
    -
    -t0 = time.time()
    -out1 = fun_prealloc(z)
    -time.time() - t0
    -
    -
    0.07567524909973145
    -
    -
    t0 = time.time()
    -out2 = fun_append(z)
    -time.time() - t0
    -
    -
    0.07505297660827637
    -
    -
    t0 = time.time()
    -out3 = fun_append_np(z)
    -time.time() - t0
    -
    -
    2.527529239654541
    +
    n = 100000
    +z = np.random.normal(size = n)
    +
    +## Pre-allocation
    +
    +def fun_prealloc(vals):
    +   n = len(vals)
    +   x = [0] * n
    +   for i in range(n):
    +       x[i] = np.exp(vals[i])
    +   return(x)
    +
    +## Appending to a list
    +
    +def fun_append(vals):
    +   x = []
    +   for i in range(n):
    +       x.append(np.exp(vals[i]))
    +   return(x)
    +
    +## Appending to a numpy array
    +
    +def fun_append_np(vals):
    +   x = np.array([])
    +   for i in range(n):
    +       x = np.append(x, np.exp(vals[i]))
    +   return(x)
    +
    +
    +t0 = time.time()
    +out1 = fun_prealloc(z)
    +time.time() - t0
    +
    +
    0.07958865165710449
    +
    +
    t0 = time.time()
    +out2 = fun_append(z)
    +time.time() - t0
    +
    +
    0.07781720161437988
    +
    +
    t0 = time.time()
    +out3 = fun_append_np(z)
    +time.time() - t0
    +
    +
    2.6464624404907227

    So what’s going on? First let’s consider what is happening with the use of np.append. Note that it is a function, rather than a method, and we need to reassign to x. What must be happening in terms of memory use and copying when we append an element?

    -
    x = np.random.normal(size = 5)
    -id(x)
    +
    x = np.random.normal(size = 5)
    +id(x)
    -
    140311151335472
    +
    140457191757872
    -
    id(np.append(x, 3.34))
    +
    id(np.append(x, 3.34))
    -
    140311151335568
    +
    140457191757968

    We can avoid that large cost of copying and memory allocation by pre-allocating space for the entire output array. (This is equivalent to variable initialization in compiled languages.)

    Ok, but how is it that we can append to the list at apparently no cost?

    It’s not magic, just that Python is clever. Let’s get an idea of what is going on:

    -
    def fun_append2(vals):
    -   n = len(vals)
    -   x = []
    -   print(f"Initial id: {id(x)}")
    -   sz = sys.getsizeof(x)
    -   print(f"iteration 0: size {sz}")
    -   for i in range(n):
    -       x.append(np.exp(vals[i]))
    -       if sys.getsizeof(x) != sz:
    -           sz = sys.getsizeof(x)
    -           print(f"iteration {i}: size {sz}")
    -   print(f"Final id: {id(x)}")
    -   return(x)
    -
    -z = np.random.normal(size = 1000)
    -out = fun_append2(z)
    -
    -
    Initial id: 140311008513600
    +
    def fun_append2(vals):
    +   n = len(vals)
    +   x = []
    +   print(f"Initial id: {id(x)}")
    +   sz = sys.getsizeof(x)
    +   print(f"iteration 0: size {sz}")
    +   for i in range(n):
    +       x.append(np.exp(vals[i]))
    +       if sys.getsizeof(x) != sz:
    +           sz = sys.getsizeof(x)
    +           print(f"iteration {i}: size {sz}")
    +   print(f"Final id: {id(x)}")
    +   return(x)
    +
    +z = np.random.normal(size = 1000)
    +out = fun_append2(z)
    +
    +
    Initial id: 140457183986560
     iteration 0: size 56
     iteration 0: size 88
     iteration 4: size 120
    @@ -3368,18 +3378,18 @@ 

    Pre-allocating memor iteration 760: size 6936 iteration 860: size 7832 iteration 972: size 8856 -Final id: 140311008513600

    +Final id: 140457183986560

Surprisingly, the id of x doesn’t seem to change, even though we are allocating new memory at many of the iterations. I haven’t looked into this fully, but I believe that what is happening is that x is a wrapper object that contains within it a reference to an array of pointers to the list elements. The location of the wrapper object doesn’t change, but the underlying array of pointers is being reallocated.

    Side note: our assessment of size above does not include the actual size of the list elements.

    -
    print(sys.getsizeof(out))
    +
    print(sys.getsizeof(out))
    8856
    -
    out[2] = np.random.normal(size = 100000)
    -print(sys.getsizeof(out))
    +
    out[2] = np.random.normal(size = 100000)
    +print(sys.getsizeof(out))
    8856
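A rough way to also account for the elements of a flat list (a sketch, assuming out is the list built above; it ignores nested containers and shared references):

sys.getsizeof(out) + sum(sys.getsizeof(v) for v in out)   # list overhead plus the elements themselves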
    @@ -3390,21 +3400,21 @@

    Pre-allocating memor

    Vectorization and use of fast matrix algebra

    One key way to write efficient Python code is to take advantage of numpy’s vectorized operations.

    -
    n = 10**6
    -z = np.random.normal(size = n)
    -t0 = time.time()
    -x = np.exp(z)
    -print(time.time() - t0)
    +
    n = 10**6
    +z = np.random.normal(size = n)
    +t0 = time.time()
    +x = np.exp(z)
    +print(time.time() - t0)
    0.03270316123962402
    -
    x = np.zeros(n)  # Leave out pre-allocation timing to focus on computation.
    -t0 = time.time()
    -for i in range(n):
    -    x[i] = np.exp(z[i])
    -
    -
    -print(time.time() - t0)
    +
    x = np.zeros(n)  # Leave out pre-allocation timing to focus on computation.
    +t0 = time.time()
    +for i in range(n):
    +    x[i] = np.exp(z[i])
    +
    +
    +print(time.time() - t0)
    0.808849573135376
    @@ -3414,27 +3424,27 @@

    -
    ## On an SCF machine:
    -cat /usr/local/linux/mambaforge-3.11/lib/python3.11/site-packages/scipy/linalg/_basic.py
    +
    ## On an SCF machine:
    +cat /usr/local/linux/mambaforge-3.11/lib/python3.11/site-packages/scipy/linalg/_basic.py

With a bit more digging around we could verify that trtrs is a LAPACK function by doing some grepping:

    ./linalg/_basic.py:    trtrs, = get_lapack_funcs(('trtrs',), (a1, b1))

Many numpy and scipy functions allow you to pass in arrays and operate on them in vectorized fashion. So before writing a for loop, look at the help information on the relevant function(s) to see whether they operate on arrays directly; functions might take arrays for one or more of their arguments.
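For instance (a small illustration, not from the original notes), scipy’s distribution functions accept array arguments and broadcast over them:

from scipy import stats

vals = np.array([-1.3, 0.5, 2.2])
stats.norm.cdf(vals)                        # vectorized over the input array
stats.norm.cdf(vals, loc = 2., scale = 3.)  # loc and scale could themselves be arrays (broadcasting applies)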

    Outside of the numerical packages, we often have to manually do the looping:

    -
    x = [3.5, 2.7, 4.6]
    -try:
    -    math.cos(x)
    -except Exception as error:
    -    print(error)
    +
    x = [3.5, 2.7, 4.6]
    +try:
    +    math.cos(x)
    +except Exception as error:
    +    print(error)
    must be real number, not list
    -
    [math.cos(val) for val in x]
    +
    [math.cos(val) for val in x]
    [-0.9364566872907963, -0.9040721420170612, -0.11215252693505487]
    -
    list(map(math.cos, x))
    +
    list(map(math.cos, x))
    [-0.9364566872907963, -0.9040721420170612, -0.11215252693505487]
    @@ -3446,24 +3456,24 @@

    \(y_{i\cdot}=\sum_{j}y_{ij}\) and \(y_{\cdot j} = \sum_{i} y_{ij}\) and \(y_{\cdot\cdot} = \sum_{i} \sum_{j} y_{ij}\). Write this in a vectorized way without any loops. Note that ‘vectorized’ calculations also work with matrices and arrays.

    Sometimes we can exploit vectorized mathematical operations in surprising ways, though sometimes the code is uglier.

    -
    x = np.random.normal(size = n)
    -
    -## List comprehension
    -timeit.timeit('truncx = [max(0,val) for val in x]', number = 10, globals = {'x':x})
    +
    x = np.random.normal(size = n)
    +
    +## List comprehension
    +timeit.timeit('truncx = [max(0,val) for val in x]', number = 10, globals = {'x':x})
    1.7915509291924536
    -
    ## Vectorized slice replacement
    -timeit.timeit('truncx = x; truncx[x < 0] = 0', number = 10, globals = {'x':x})
    +
    ## Vectorized slice replacement
    +timeit.timeit('truncx = x; truncx[x < 0] = 0', number = 10, globals = {'x':x})
    0.009905248880386353
    -
    ## Vectorized math trick
    -timeit.timeit('truncx = x * x>0', number = 10, globals = {'x':x})
    +
    ## Vectorized math trick
    +timeit.timeit('truncx = x * x>0', number = 10, globals = {'x':x})
    0.017728352919220924
    @@ -3472,7 +3482,7 @@

    more complicated (see more in Section 4.6 on the cache), such that it may not matter for numpy calculations.

  • +
• In general, in Python looping over rows is likely to be faster than looping over columns because of numpy’s row-major ordering (by default, matrices are stored in memory as a long array in which values in a row are adjacent to each other). However, how numpy handles this is more complicated (see more in the section on cache-aware programming), such that it may not matter for numpy calculations.
  • You can use direct arithmetic operations to add/subtract/multiply/divide a vector by each column of a matrix, e.g. A*b does element-wise multiplication of each column of A by a vector b. If you need to operate by row, you can do it by transposing the matrix.
  • Caution: relying on Python’s broadcasting rule in the context of vectorized operations, such as is done when direct-multiplying a matrix by a vector to scale the columns relative to each other, can be dangerous as the code may not be easy for someone to read and poses greater dangers of bugs. In some cases you may want to first write the code more directly and then compare the more efficient code to make sure the results are the same. It’s also a good idea to comment your code in such cases.

    @@ -3482,72 +3492,72 @@

    Are m

    The potential for inefficiency of looping and map operations in interpreted languages will depend in part on whether a substantial part of the work is in the overhead involved in the looping or in the time required by the function evaluation on each of the elements. If you’re worried about speed, it’s a good idea to benchmark map or pandas.apply against looping.

    Here’s an example where the bulk of time is in the actual computation and not in the looping itself. Here map is not faster than a loop.

    -
    import time
    -import statsmodels.api as sm
    -
    -n = 500000;
    -nr = 10000
    -nCalcs = int(n/nr)
    -
    -mat = np.random.normal(size = (nr, nCalcs))
    -
    -X = list(range(nr))
    -X = sm.add_constant(X)
    -
    -def regrFun(i):
    -    model = sm.OLS(mat[:,i], X)
    -    return(model.fit().params[1])
    -
    -t0 = time.time()
    -out1 = list(map(regrFun, range(nCalcs)))
    -time.time() - t0
    -
    -
    0.06007575988769531
    -
    -
    t0 = time.time()
    -out2 = np.zeros(nCalcs)
    -for i in range(nCalcs):
    -    out2[i] = regrFun(i)
    -
    -
    -time.time() - t0
    -
    -
    0.06388640403747559
    +
    import time
    +import statsmodels.api as sm
    +
    +n = 500000;
    +nr = 10000
    +nCalcs = int(n/nr)
    +
    +mat = np.random.normal(size = (nr, nCalcs))
    +
    +X = list(range(nr))
    +X = sm.add_constant(X)
    +
    +def regrFun(i):
    +    model = sm.OLS(mat[:,i], X)
    +    return(model.fit().params[1])
    +
    +t0 = time.time()
    +out1 = list(map(regrFun, range(nCalcs)))
    +time.time() - t0
    +
    +
    0.06300592422485352
    +
    +
    t0 = time.time()
    +out2 = np.zeros(nCalcs)
    +for i in range(nCalcs):
    +    out2[i] = regrFun(i)
    +
    +
    +time.time() - t0
    +
    +
    0.06177210807800293

    And here’s an example, where (unlike the previous example) the core computation is very fast, so we might expect the overhead of looping (in its various forms seen here) to be important.

    -
    import time
    -n = 10**6
    -x = np.random.normal(size = n)
    -
    -t0 = time.time()
    -out = np.exp(x)
    -time.time() - t0
    +
    import time
    +n = 10**6
    +x = np.random.normal(size = n)
    +
    +t0 = time.time()
    +out = np.exp(x)
    +time.time() - t0
    -
    0.014000177383422852
    +
    0.013067245483398438
    -
    t0 = time.time()
    -vals = np.zeros(n)
    -for i in range(n):
    -    vals[i] = np.exp(x[i])
    -
    -
    -time.time() - t0
    +
    t0 = time.time()
    +vals = np.zeros(n)
    +for i in range(n):
    +    vals[i] = np.exp(x[i])
    +
    +
    +time.time() - t0
    -
    0.842008113861084
    +
    0.8266475200653076
    -
    t0 = time.time()
    -vals = [np.exp(v) for v in x]
    -time.time() - t0
    +
    t0 = time.time()
    +vals = [np.exp(v) for v in x]
    +time.time() - t0
    -
    0.7019450664520264
    +
    0.6520075798034668
    -
    t0 = time.time()
    -vals = list(map(np.exp, x))
    -time.time() - t0
    +
    t0 = time.time()
    +vals = list(map(np.exp, x))
    +time.time() - t0
    -
    0.6085102558135986
    +
    0.5861358642578125

    In fact, it looks like we can’t avoid the overhead unless we use the vectorized call, which is of course the recommended approach in this case, both for speed and readability (and conciseness).

    @@ -3557,15 +3567,15 @@

    Matrix algebra e

    Often calculations that are not explicitly linear algebra calculations can be done as matrix algebra. If our Python installation has a fast (and possibly parallelized) BLAS, this allows our calculation to take advantage of it.

    For example, we can sum the rows of a matrix by multiplying by a vector of ones.

    -
    mat = np.random.normal(size=(500,500))
    -
    -timeit.timeit('mat.dot(np.ones(500))', setup = 'import numpy as np', number = 1000, globals = {'mat': mat})
    +
    mat = np.random.normal(size=(500,500))
    +
    +timeit.timeit('mat.dot(np.ones(500))', setup = 'import numpy as np', number = 1000, globals = {'mat': mat})
    -
    0.0460930522531271
    +
    0.03959464281797409
    -
    timeit.timeit('np.sum(mat, axis = 1)', setup = 'import numpy as np', number = 1000, globals = {'mat': mat})
    +
    timeit.timeit('np.sum(mat, axis = 1)', setup = 'import numpy as np', number = 1000, globals = {'mat': mat})
    -
    0.12684659287333488
    +
    0.12120177410542965

Given the extra computation involved in actually multiplying each number by one, it’s surprising that this is faster than numpy’s sum function. One thing we’d want to know is whether the BLAS matrix multiplication call is being done in parallel.
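One way to check (a sketch assuming the third-party threadpoolctl package is installed) is to inspect the BLAS thread pool, or to cap it at one thread and re-run the timing for comparison:

from threadpoolctl import threadpool_info, threadpool_limits

threadpool_info()    # lists the loaded BLAS/OpenMP libraries and their 'num_threads'

with threadpool_limits(limits = 1):   # force single-threaded BLAS
    timeit.timeit('mat.dot(np.ones(500))', setup = 'import numpy as np',
                  number = 1000, globals = {'mat': mat})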

    @@ -3576,20 +3586,20 @@

    Matrix algebra e

    Order of operations and efficiency

    When doing matrix algebra, the order in which you do operations can be critical for efficiency. How should I order the following calculation?

    -
    n = 5000
    -A = np.random.normal(size=(n, n))
    -B = np.random.normal(size=(n, n))
    -x = np.random.normal(size=n)
    -
    -t0 = time.time()
    -res1 = (A @ B) @ x
    -print(time.time() - t0)
    +
    n = 5000
    +A = np.random.normal(size=(n, n))
    +B = np.random.normal(size=(n, n))
    +x = np.random.normal(size=n)
    +
    +t0 = time.time()
    +res1 = (A @ B) @ x
    +print(time.time() - t0)
    1.7937490940093994
    -
    t0 = time.time()
    -res1 = A @ (B @ x)
    -print(time.time() - t0)
    +
    t0 = time.time()
    +res1 = A @ (B @ x)
    +print(time.time() - t0)
    0.08377599716186523
    @@ -3601,15 +3611,15 @@

    Avoiding u

We can use the matrix direct product (i.e., A*B) to do some manipulations much more quickly than using matrix multiplication. Challenge: How can I use the direct product to find the trace of a matrix product, \(XY\)?

    Finally, when working with diagonal matrices, you can generally get much faster results by being smart. The following operations: \(X+D\), \(DX\), \(XD\) are mathematically the sum of two matrices and products of two matrices. But we can do the computation without using two full matrices. Challenge: How?

    -
    n = 1000
    -X = np.random.normal(size=(n, n))
    -diagvals = np.random.normal(size=n)
    -D = np.diag(diagvals)
    -
    -# The following lines are very inefficient
    -summedMat = X + D
    -prodMat1 = D @ X
    -prodMat2 = X @ D
    +
    n = 1000
    +X = np.random.normal(size=(n, n))
    +diagvals = np.random.normal(size=n)
    +D = np.diag(diagvals)
    +
    +# The following lines are very inefficient
    +summedMat = X + D
    +prodMat1 = D @ X
    +prodMat2 = X @ D

More generally, sparse matrices and structured matrices (such as block diagonal matrices) can be worked with MUCH more efficiently than if they are treated as arbitrary matrices. The scipy.sparse package (for both structured and arbitrary sparse matrices) can help, as can specialized code available in other languages, such as C and Fortran packages.

    @@ -3618,25 +3628,25 @@

    Speed of lookup

    There are lots of situations in which we need to retrieve values for subsequent computations. In some cases we might be retrieving elements of an array or looking up values in a dictionary.

    Let’s compare the speed of some different approaches to lookup.

    -
    n = 1000
    -x = list(np.random.normal(size = n))
    -keys = [str(v) for v in range(n)]
    -xD = dict(zip(keys, x))
    -
    -timeit.timeit("x[500]", number = 10**6, globals = {'x':x})
    +
    n = 1000
    +x = list(np.random.normal(size = n))
    +keys = [str(v) for v in range(n)]
    +xD = dict(zip(keys, x))
    +
    +timeit.timeit("x[500]", number = 10**6, globals = {'x':x})
    -
    0.015508504584431648
    +
    0.016299044713377953
    -
    timeit.timeit("xD['500']", number=10**6, globals = {'xD':xD})
    +
    timeit.timeit("xD['500']", number=10**6, globals = {'xD':xD})
    -
    0.029696810990571976
    +
    0.030375661328434944

    How is it that Python can look up by key in the dictionary at essentially the same speed as jumping to an index position? It uses hashing, which allows O(1) lookup. In contrast, if one has to look through each key in turn, that is O(n), which is much slower:

    -
    timeit.timeit("x[keys.index('500')]", number = 10**6, globals = {'x':x, 'keys':keys})
    +
    timeit.timeit("x[keys.index('500')]", number = 10**6, globals = {'x':x, 'keys':keys})
    -
    6.116314208135009
    +
    6.1827212162315845

    As a further point of contrast, if we look up elements by name in R in named vectors or lists, that is much slower than looking up by index, because R doesn’t use hashing in that context and has to scan through the objects one by one until it finds the one with the name it is looking for. This stands in contrast to R and Python being able to directly go to the position of interest based on the index of an array, or to the hash-based lookup in a Python dictionary or an R environment.

    @@ -3654,14 +3664,14 @@

    Cache-aware progra

    If you know the size of the cache, you can try to design your code so that in a given part of your code you access data structures that will fit in the cache. This sort of thing is generally more relevant if you’re coding in a language like C. But it can matter sometimes in interpreted languages too.

    Let’s see what happens in Python.

    -
    nr = 800000
    -nc = 100
    -
    -A = np.random.normal(size=(nr, nc))
    -tA = np.random.normal(size=(nc, nr))  
    -
    -## Verify that A is row-major using `.flags` (notice the `C_CONTIGUOUS: True`).
    -A.flags
    +
    nr = 800000
    +nc = 100
    +
    +A = np.random.normal(size=(nr, nc))
    +tA = np.random.normal(size=(nc, nr))  
    +
    +## Verify that A is row-major using `.flags` (notice the `C_CONTIGUOUS: True`).
    +A.flags
      C_CONTIGUOUS : True
       F_CONTIGUOUS : False
    @@ -3674,33 +3684,33 @@ 

    Cache-aware progra

Note that I didn’t use A.T or np.transpose as that doesn’t make a copy in memory and so the transposed matrix doesn’t end up being row-major. You can use A.flags and A.T.flags to see this.
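A quick way to see this (a small sketch, not in the original notes; Asmall is a hypothetical example array so as not to clobber the A defined above):

Asmall = np.random.normal(size = (3, 4))
Asmall.flags['C_CONTIGUOUS']                           # True: stored row-major
Asmall.T.flags['C_CONTIGUOUS']                         # False: the transpose is just a view with swapped strides
np.ascontiguousarray(Asmall.T).flags['C_CONTIGUOUS']   # True: this makes an actual row-major copy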

    Now let’s time things.

    -
    # Define the mean calculations as functions
    -def mean_by_column():
    -    return np.mean(A, axis=0)
    -
    -def mean_by_row():
    -    return np.mean(tA, axis=1)
    -
    -timeit.timeit(mean_by_column, number=10)
    +
    # Define the mean calculations as functions
    +def mean_by_column():
    +    return np.mean(A, axis=0)
    +
    +def mean_by_row():
    +    return np.mean(tA, axis=1)
    +
    +timeit.timeit(mean_by_column, number=10)
    -
    0.5135188214480877
    +
    0.5146233681589365
    -
    timeit.timeit(mean_by_row, number=10)
    +
    timeit.timeit(mean_by_row, number=10)
    -
    0.4520120248198509
    +
    0.4034226518124342

    Suppose we instead do the looping manually.

    -
    timeit.timeit('[np.mean(A[:,col]) for col in range(A.shape[1])]',
    -    setup = 'import numpy as np', number=10, globals = {'A': A})   # 5.10 sec.
    +
    timeit.timeit('[np.mean(A[:,col]) for col in range(A.shape[1])]',
    +    setup = 'import numpy as np', number=10, globals = {'A': A})   # 5.10 sec.
    -
    5.4370351154357195
    +
    5.463273551315069
    -
    timeit.timeit('[np.mean(tA[row,:]) for row in range(tA.shape[0])]',
    -    setup = 'import numpy as np', number=10, globals = {'tA': tA})  # 5.06 sec.
    +
    timeit.timeit('[np.mean(tA[row,:]) for row in range(tA.shape[0])]',
    +    setup = 'import numpy as np', number=10, globals = {'tA': tA})  # 5.06 sec.
    -
    0.4240651857107878
    +
    0.42474850825965405

Indeed, the row-wise calculations are faster, particularly if we do the looping manually. So that suggests numpy might be doing something clever in its implementation of mean with the axis argument.

    @@ -3713,7 +3723,7 @@

    Cache-aware progra

    Loop fusion

    Let’s consider this (vectorized) code:

    -
    x = np.exp(x) + 3*np.sin(x)
    +
    x = np.exp(x) + 3*np.sin(x)

    This code has some downsides.

      @@ -3722,9 +3732,9 @@

      Loop fusion

    Contrast that to running directly as a for loop (e.g., here in Julia or in C/C++):

    -
    for i in 1:length(x)
    -    x[i] = exp(x[i]) + 3*sin(x[i])
    -end
    +
    for i in 1:length(x)
    +    x[i] = exp(x[i]) + 3*sin(x[i])
    +end

    How does that affect the downsides mentioned above?

    Combining loops is called ‘fusing’ and is an important optimization that Julia can do, as shown in this demo. It’s also a key optimization done by XLA, a compiler used with Tensorflow, so one approach to getting loop fusion in Python is to use Tensorflow for such calculations within Python rather than simply using numpy.
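Within the numpy ecosystem, the third-party numexpr package provides a limited form of loop fusion: it compiles a string expression and evaluates it in a single blocked pass over the data, avoiding full-size temporaries for the intermediate results. A sketch, assuming numexpr is installed:

import numexpr as ne

x = np.random.normal(size = 10**6)
out = ne.evaluate("exp(x) + 3*sin(x)")   # fused evaluation; no array-sized temporaries for exp(x) or sin(x)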

    @@ -3745,13 +3755,13 @@

    Hashing (inc

    Lazy evaluation

    What’s strange about this R code?

    -
    f <- function(x) print("hi")
    -system.time(mean(rnorm(1000000)))
    +
    f <- function(x) print("hi")
    +system.time(mean(rnorm(1000000)))
       user  system elapsed 
    -  0.057   0.000   0.057 
    + 0.06 0.00 0.06
    -
    system.time(f(3))
    +
    system.time(f(3))
    [1] "hi"
    @@ -3759,7 +3769,7 @@

    Lazy evaluation

       user  system elapsed 
           0       0       0 
    -
    system.time(f(mean(rnorm(1000000)))) 
    +
    system.time(f(mean(rnorm(1000000)))) 
    [1] "hi"
    diff --git a/units/unit5-programming.pdf b/units/unit5-programming.pdf index 1adbaac..3ac1b48 100644 Binary files a/units/unit5-programming.pdf and b/units/unit5-programming.pdf differ diff --git a/units/unit5-programming_files/figure-pdf/unnamed-chunk-52-1.pdf b/units/unit5-programming_files/figure-pdf/unnamed-chunk-52-1.pdf index c16ac20..2077eae 100644 Binary files a/units/unit5-programming_files/figure-pdf/unnamed-chunk-52-1.pdf and b/units/unit5-programming_files/figure-pdf/unnamed-chunk-52-1.pdf differ diff --git a/units/unit5-programming_files/figure-pdf/unnamed-chunk-53-3.pdf b/units/unit5-programming_files/figure-pdf/unnamed-chunk-53-3.pdf index ce0e0fb..1789072 100644 Binary files a/units/unit5-programming_files/figure-pdf/unnamed-chunk-53-3.pdf and b/units/unit5-programming_files/figure-pdf/unnamed-chunk-53-3.pdf differ diff --git a/units/unit6-parallel.pdf b/units/unit6-parallel.pdf index b7e3191..eb6bd74 100644 Binary files a/units/unit6-parallel.pdf and b/units/unit6-parallel.pdf differ diff --git a/units/unit7-bigData.pdf b/units/unit7-bigData.pdf index 6b0f4f2..20f1c41 100644 Binary files a/units/unit7-bigData.pdf and b/units/unit7-bigData.pdf differ diff --git a/units/unit8-numbers.pdf b/units/unit8-numbers.pdf index 3a42f3b..2d71303 100644 Binary files a/units/unit8-numbers.pdf and b/units/unit8-numbers.pdf differ diff --git a/units/unit9-sim.pdf b/units/unit9-sim.pdf index 5aaa140..8ad4f10 100644 Binary files a/units/unit9-sim.pdf and b/units/unit9-sim.pdf differ