From 5ee5c40402c92a498ed8d6eeb6cf0b5c1680817b Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Fri, 7 Jun 2024 00:23:59 -0500 Subject: [PATCH 01/91] arm64 pmap: Defer bti lookup Defer the bti lookup until after page table page allocation is complete. We sometimes release the pmap lock and sleep during page table page allocation. Consequently, the result of a bti lookup from before page table page allocation could be stale when we finally create the mapping based on it. Modify pmap_bti_same() to update the prototype PTE at the same time as checking the address range. This eliminates the need for calling pmap_pte_bti() in addition to pmap_bti_same(). pmap_bti_same() was already doing most of the work of pmap_pte_bti(). Reviewed by: markj Differential Revision: https://reviews.freebsd.org/D45502 --- sys/arm64/arm64/pmap.c | 73 +++++++++++++++++++++++++----------------- 1 file changed, 44 insertions(+), 29 deletions(-) diff --git a/sys/arm64/arm64/pmap.c b/sys/arm64/arm64/pmap.c index 92c1c824ba4e..7b30b2a6ae37 100644 --- a/sys/arm64/arm64/pmap.c +++ b/sys/arm64/arm64/pmap.c @@ -508,7 +508,8 @@ static void pmap_update_entry(pmap_t pmap, pd_entry_t *pte, pd_entry_t newpte, static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va); static uma_zone_t pmap_bti_ranges_zone; -static bool pmap_bti_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva); +static bool pmap_bti_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, + pt_entry_t *pte); static pt_entry_t pmap_pte_bti(pmap_t pmap, vm_offset_t va); static void pmap_bti_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva); static void *bti_dup_range(void *ctx, void *data); @@ -4955,21 +4956,22 @@ pmap_promote_l3c(pmap_t pmap, pd_entry_t *l3p, vm_offset_t va) #endif /* VM_NRESERVLEVEL > 0 */ static int -pmap_enter_largepage(pmap_t pmap, vm_offset_t va, pt_entry_t newpte, int flags, +pmap_enter_largepage(pmap_t pmap, vm_offset_t va, pt_entry_t pte, int flags, int psind) { - pd_entry_t *l0p, *l1p, *l2p, 
origpte; + pd_entry_t *l0p, *l1p, *l2p, newpte, origpte; vm_page_t mp; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT(psind > 0 && psind < MAXPAGESIZES, ("psind %d unexpected", psind)); - KASSERT((PTE_TO_PHYS(newpte) & (pagesizes[psind] - 1)) == 0, - ("unaligned phys address %#lx newpte %#lx psind %d", - PTE_TO_PHYS(newpte), newpte, psind)); + KASSERT((PTE_TO_PHYS(pte) & (pagesizes[psind] - 1)) == 0, + ("unaligned phys address %#lx pte %#lx psind %d", + PTE_TO_PHYS(pte), pte, psind)); restart: - if (!pmap_bti_same(pmap, va, va + pagesizes[psind])) + newpte = pte; + if (!pmap_bti_same(pmap, va, va + pagesizes[psind], &newpte)) return (KERN_PROTECTION_FAILURE); if (psind == 2) { PMAP_ASSERT_L1_BLOCKS_SUPPORTED; @@ -5123,9 +5125,6 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, lock = NULL; PMAP_LOCK(pmap); - /* Wait until we lock the pmap to protect the bti rangeset */ - new_l3 |= pmap_pte_bti(pmap, va); - if ((flags & PMAP_ENTER_LARGEPAGE) != 0) { KASSERT((m->oflags & VPO_UNMANAGED) != 0, ("managed largepage va %#lx flags %#x", va, flags)); @@ -5197,6 +5196,7 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, orig_l3 = pmap_load(l3); opa = PTE_TO_PHYS(orig_l3); pv = NULL; + new_l3 |= pmap_pte_bti(pmap, va); /* * Is the specified virtual address already mapped? @@ -5405,7 +5405,6 @@ pmap_enter_l2_rx(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, new_l2 = (pd_entry_t)(VM_PAGE_TO_PTE(m) | ATTR_DEFAULT | ATTR_S1_IDX(m->md.pv_memattr) | ATTR_S1_AP(ATTR_S1_AP_RO) | L2_BLOCK); - new_l2 |= pmap_pte_bti(pmap, va); if ((m->oflags & VPO_UNMANAGED) == 0) { new_l2 |= ATTR_SW_MANAGED; new_l2 &= ~ATTR_AF; @@ -5478,7 +5477,7 @@ pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags, * and let vm_fault() cope. Check after l2 allocation, since * it could sleep. 
*/ - if (!pmap_bti_same(pmap, va, va + L2_SIZE)) { + if (!pmap_bti_same(pmap, va, va + L2_SIZE, &new_l2)) { KASSERT(l2pg != NULL, ("pmap_enter_l2: missing L2 PTP")); pmap_abort_ptp(pmap, va, l2pg); return (KERN_PROTECTION_FAILURE); @@ -5633,7 +5632,6 @@ pmap_enter_l3c_rx(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t *ml3p, l3e = VM_PAGE_TO_PTE(m) | ATTR_DEFAULT | ATTR_S1_IDX(m->md.pv_memattr) | ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_CONTIGUOUS | L3_PAGE; - l3e |= pmap_pte_bti(pmap, va); if ((m->oflags & VPO_UNMANAGED) == 0) { l3e |= ATTR_SW_MANAGED; l3e &= ~ATTR_AF; @@ -5733,19 +5731,6 @@ pmap_enter_l3c(pmap_t pmap, vm_offset_t va, pt_entry_t l3e, u_int flags, } } l3p = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(*ml3p)); - -have_l3p: - /* - * If bti is not the same for the whole L3C range, return - * failure and let vm_fault() cope. Check after L3 allocation, - * since it could sleep. - */ - if (!pmap_bti_same(pmap, va, va + L3C_SIZE)) { - (*ml3p)->ref_count -= L3C_ENTRIES - 1; - pmap_abort_ptp(pmap, va, *ml3p); - *ml3p = NULL; - return (KERN_PROTECTION_FAILURE); - } } else { *ml3p = NULL; @@ -5768,8 +5753,22 @@ pmap_enter_l3c(pmap_t pmap, vm_offset_t va, pt_entry_t l3e, u_int flags, pmap_load(pde))); } } +have_l3p: l3p = &l3p[pmap_l3_index(va)]; + /* + * If bti is not the same for the whole L3C range, return failure + * and let vm_fault() cope. Check after L3 allocation, since + * it could sleep. + */ + if (!pmap_bti_same(pmap, va, va + L3C_SIZE, &l3e)) { + KASSERT(*ml3p != NULL, ("pmap_enter_l3c: missing L3 PTP")); + (*ml3p)->ref_count -= L3C_ENTRIES - 1; + pmap_abort_ptp(pmap, va, *ml3p); + *ml3p = NULL; + return (KERN_PROTECTION_FAILURE); + } + /* * If there are existing mappings, either abort or remove them. */ @@ -9271,8 +9270,16 @@ pmap_bti_deassign_all(pmap_t pmap) rangeset_remove_all(pmap->pm_bti); } +/* + * Returns true if the BTI setting is the same across the specified address + * range, and false otherwise. 
When returning true, updates the referenced PTE + * to reflect the BTI setting. + * + * Only stage 1 pmaps support BTI. The kernel pmap is always a stage 1 pmap + * that has the same BTI setting implicitly across its entire address range. + */ static bool -pmap_bti_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) +pmap_bti_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pt_entry_t *pte) { struct rs_el *next_rs, *rs; vm_offset_t va; @@ -9282,10 +9289,16 @@ pmap_bti_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) ("%s: Start address not in canonical form: %lx", __func__, sva)); KASSERT(ADDR_IS_CANONICAL(eva), ("%s: End address not in canonical form: %lx", __func__, eva)); + KASSERT((*pte & ATTR_S1_GP) == 0, + ("%s: pte %lx has ATTR_S1_GP preset", __func__, *pte)); - if (pmap->pm_bti == NULL || ADDR_IS_KERNEL(sva)) + if (pmap == kernel_pmap) { + *pte |= ATTR_KERN_GP; + return (true); + } + if (pmap->pm_bti == NULL) return (true); - MPASS(!ADDR_IS_KERNEL(eva)); + PMAP_ASSERT_STAGE1(pmap); rs = rangeset_lookup(pmap->pm_bti, sva); if (rs == NULL) { rs = rangeset_next(pmap->pm_bti, sva); @@ -9299,6 +9312,8 @@ pmap_bti_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) return (false); rs = next_rs; } + if (rs != NULL) + *pte |= ATTR_S1_GP; return (true); } From 888796ade2842486d3167067e8034254c38aadd3 Mon Sep 17 00:00:00 2001 From: Ed Maste Date: Tue, 19 Mar 2024 10:31:39 -0400 Subject: [PATCH 02/91] libm: fma: correct zero sign with small inputs PR: 277783 Reported by: Victor Stinner Submitted by: kargl MFC after: 1 week Differential Revision: https://reviews.freebsd.org/D44433 --- lib/msun/src/s_fma.c | 4 +++- lib/msun/src/s_fmal.c | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/lib/msun/src/s_fma.c b/lib/msun/src/s_fma.c index b8a342646d85..4d08b40cc71a 100644 --- a/lib/msun/src/s_fma.c +++ b/lib/msun/src/s_fma.c @@ -267,7 +267,9 @@ fma(double x, double y, double z) */ fesetround(oround); volatile double vzs = zs; /* XXX gcc CSE bug 
workaround */ - return (xy.hi + vzs + ldexp(xy.lo, spread)); + xs = ldexp(xy.lo, spread); + xy.hi += vzs; + return (xy.hi == 0 ? xs : xy.hi + xs); } if (oround != FE_TONEAREST) { diff --git a/lib/msun/src/s_fmal.c b/lib/msun/src/s_fmal.c index 3d333632127c..12f9c364670b 100644 --- a/lib/msun/src/s_fmal.c +++ b/lib/msun/src/s_fmal.c @@ -248,7 +248,9 @@ fmal(long double x, long double y, long double z) */ fesetround(oround); volatile long double vzs = zs; /* XXX gcc CSE bug workaround */ - return (xy.hi + vzs + ldexpl(xy.lo, spread)); + xs = ldexpl(xy.lo, spread); + xy.hi += vzs; + return (xy.hi == 0 ? xs : xy.hi + xs); } if (oround != FE_TONEAREST) { From 13a51233e4c7d6cff04043c38845b1ec1af38680 Mon Sep 17 00:00:00 2001 From: Rick Macklem Date: Sat, 8 Jun 2024 16:40:52 -0700 Subject: [PATCH 03/91] nfsd: Delete an unused VNET global variable During code inspection, I noticed that NFSD_VNET_DEFINE(nfsrv_dontlisthead) is unused, so delete it. MFC after: 2 weeks --- sys/fs/nfsserver/nfs_nfsdsubs.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/sys/fs/nfsserver/nfs_nfsdsubs.c b/sys/fs/nfsserver/nfs_nfsdsubs.c index 0d7e4c73fe69..d80826993f23 100644 --- a/sys/fs/nfsserver/nfs_nfsdsubs.c +++ b/sys/fs/nfsserver/nfs_nfsdsubs.c @@ -57,9 +57,6 @@ NFSD_VNET_DECLARE(int, nfs_rootfhset); NFSD_VNET_DECLARE(uid_t, nfsrv_defaultuid); NFSD_VNET_DECLARE(gid_t, nfsrv_defaultgid); -NFSD_VNET_DEFINE(struct nfsdontlisthead, nfsrv_dontlisthead); - - char nfs_v2pubfh[NFSX_V2FH]; struct nfsdontlisthead nfsrv_dontlisthead; struct nfslayouthead nfsrv_recalllisthead; From ebc2bab04823c24c524f913457d6b88dc7ea9fac Mon Sep 17 00:00:00 2001 From: Zhenlei Huang Date: Sun, 9 Jun 2024 09:05:22 +0800 Subject: [PATCH 04/91] pflog: Correctly check if bpf peers are present On creating the pflog(4) interface, pflog_clone_create() does an unconditional bpfattach(). Use bpf_peers_present() which was introduced in commit 16d878cc99ef [1] to check the presence of bpf peers. 
This will save a few CPU cycles when no bpf peers are present. There should be no functional change. 1. 16d878cc99ef Fix the following bpf(4) race condition which can result in a panic Reviewed by: kp MFC after: 1 week Differential Revision: https://reviews.freebsd.org/D45532 --- sys/netpfil/pf/if_pflog.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sys/netpfil/pf/if_pflog.c b/sys/netpfil/pf/if_pflog.c index 7ac337a84c5d..1e73d5f51851 100644 --- a/sys/netpfil/pf/if_pflog.c +++ b/sys/netpfil/pf/if_pflog.c @@ -223,9 +223,10 @@ pflog_packet(struct pfi_kkif *kif, struct mbuf *m, sa_family_t af, struct pfloghdr hdr; if (kif == NULL || m == NULL || rm == NULL || pd == NULL) - return ( 1); + return (1); - if ((ifn = V_pflogifs[rm->logif]) == NULL || !ifn->if_bpf) + ifn = V_pflogifs[rm->logif]; + if (ifn == NULL || !bpf_peers_present(ifn->if_bpf)) return (0); bzero(&hdr, sizeof(hdr)); @@ -274,7 +275,7 @@ pflog_packet(struct pfi_kkif *kif, struct mbuf *m, sa_family_t af, if_inc_counter(ifn, IFCOUNTER_OPACKETS, 1); if_inc_counter(ifn, IFCOUNTER_OBYTES, m->m_pkthdr.len); - BPF_MTAP2(ifn, &hdr, PFLOG_HDRLEN, m); + bpf_mtap2(ifn->if_bpf, &hdr, PFLOG_HDRLEN, m); return (0); } From 2671bde99295d9e01d10316d0f3fb8b6d21f0f4d Mon Sep 17 00:00:00 2001 From: Zhenlei Huang Date: Sun, 9 Jun 2024 09:05:22 +0800 Subject: [PATCH 05/91] pfsync: Correctly check if bpf peers are present On creating the pfsync(4) interface, pfsync_clone_create() does an unconditional bpfattach(). Use bpf_peers_present() which was introduced in commit 16d878cc99ef [1] to check the presence of bpf peers. This will save a few CPU cycles and reduce memory usage when the synchronisation interface is not configured and there are no bpf peers present. There should be no functional change. 1.
16d878cc99ef Fix the following bpf(4) race condition which can result in a panic Reviewed by: kp MFC after: 1 week Differential Revision: https://reviews.freebsd.org/D45533 --- sys/netpfil/pf/if_pfsync.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sys/netpfil/pf/if_pfsync.c b/sys/netpfil/pf/if_pfsync.c index c22a6a5982a9..80d6fddc709c 100644 --- a/sys/netpfil/pf/if_pfsync.c +++ b/sys/netpfil/pf/if_pfsync.c @@ -1796,7 +1796,7 @@ pfsync_sendout(int schedswi, int c) ("%s: sc_len %zu", __func__, b->b_len)); PFSYNC_BUCKET_LOCK_ASSERT(b); - if (ifp->if_bpf == NULL && sc->sc_sync_if == NULL) { + if (!bpf_peers_present(ifp->if_bpf) && sc->sc_sync_if == NULL) { pfsync_drop(sc); return; } @@ -1925,10 +1925,10 @@ pfsync_sendout(int schedswi, int c) V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_EOF]++; /* we're done, let's put it on the wire */ - if (ifp->if_bpf) { + if (bpf_peers_present(ifp->if_bpf)) { m->m_data += aflen; m->m_len = m->m_pkthdr.len = len - aflen; - BPF_MTAP(ifp, m); + bpf_mtap(ifp->if_bpf, m); m->m_data -= aflen; m->m_len = m->m_pkthdr.len = len; } From 3ce9b2ee9404381a002316df670939a3bd3c994f Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Sun, 9 Jun 2024 09:49:49 -0400 Subject: [PATCH 06/91] ti/am335x: Fix the device_set_descf() call in dmtpps_probe() Fixes: 459dc61c8b05 ("arm: Convert drivers to use device_set_desc(f)()") --- sys/arm/ti/am335x/am335x_dmtpps.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sys/arm/ti/am335x/am335x_dmtpps.c b/sys/arm/ti/am335x/am335x_dmtpps.c index 5a19d3ad0dc3..32f975a310de 100644 --- a/sys/arm/ti/am335x/am335x_dmtpps.c +++ b/sys/arm/ti/am335x/am335x_dmtpps.c @@ -434,7 +434,7 @@ dmtpps_probe(device_t dev) if (dmtpps_tmr_num != tmr_num) return (ENXIO); - device_set_descf("AM335x PPS-Capture DMTimer%d", tmr_num); + device_set_descf(dev, "AM335x PPS-Capture DMTimer%d", tmr_num); return(BUS_PROBE_DEFAULT); } From da925fcebf397cc3bfc74b7aa9757efd6231aa00 Mon Sep 17 00:00:00 2001 From: 
Mark Johnston Date: Sat, 1 Jun 2024 11:30:16 -0400 Subject: [PATCH 07/91] libc/tests: Further refine the condition for installing h_raw See 8164d511d6a6 ("libc/tests: Fix installation without MK_TOOLCHAIN") for some background. Here we should really be testing MK_CLANG instead, since that's what gates compilation of libclang_rt. Fixes: 8164d511d6a6 ("libc/tests: Fix installation without MK_TOOLCHAIN") --- lib/libc/tests/ssp/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/libc/tests/ssp/Makefile b/lib/libc/tests/ssp/Makefile index 2907d1a621bf..0f5b56505134 100644 --- a/lib/libc/tests/ssp/Makefile +++ b/lib/libc/tests/ssp/Makefile @@ -30,7 +30,8 @@ PROGS+= h_memset # now on amd64 when it trips the stack bounds specified in t_ssp.sh . This # probably needs to be fixed as it's currently hardcoded. .if ${COMPILER_TYPE} == "clang" && !defined(_SKIP_BUILD) && \ - (!defined(_RECURSING_PROGS) || ${PROG} == "h_raw") && ${MK_TOOLCHAIN} == "yes" + (!defined(_RECURSING_PROGS) || ${PROG} == "h_raw") && \ + defined(MK_CLANG) && ${MK_CLANG} == "yes" .include "${SRCTOP}/lib/libclang_rt/compiler-rt-vars.mk" _libclang_rt_ubsan= ${SYSROOT}${SANITIZER_LIBDIR}/libclang_rt.ubsan_standalone-${CRTARCH}.a .if exists(${_libclang_rt_ubsan}) From bbca3a75bb412f7106a569b82c616404103be084 Mon Sep 17 00:00:00 2001 From: Christos Margiolis Date: Sun, 9 Jun 2024 17:30:22 +0200 Subject: [PATCH 08/91] sound: Include sound(4) channel information in sndstat nvlist Extend SNDST_DSPS_PROVIDER_INFO for sound(4) to include information about each channel in a given device, similar to how cat'ing /dev/sndstat with hw.snd.verbose=2 works. While here, document all provider_info fields. 
Sponsored by: The FreeBSD Foundation MFC after: 3 days Reviewed by: dev_submerge.ch, markj Differential Revision: https://reviews.freebsd.org/D45501 --- share/man/man4/sndstat.4 | 140 ++++++++++++++++++++++++++++++------ sys/dev/sound/pcm/sndstat.c | 115 ++++++++++++++++++++++++++++- sys/sys/sndstat.h | 37 ++++++++-- 3 files changed, 265 insertions(+), 27 deletions(-) diff --git a/share/man/man4/sndstat.4 b/share/man/man4/sndstat.4 index 8325490da162..2af0619961d8 100644 --- a/share/man/man4/sndstat.4 +++ b/share/man/man4/sndstat.4 @@ -29,7 +29,7 @@ .\" .\" Note: The date here should be updated whenever a non-trivial .\" change is made to the manual page. -.Dd April 15, 2021 +.Dd June 5, 2024 .Dt SNDSTAT 4 .Os .Sh NAME @@ -60,25 +60,55 @@ struct sndstioc_nv_arg { Here is an example of an nvlist object with explanations of the common fields: .Bd -literal -offset indent dsps (NVLIST ARRAY): 1 - from_user (BOOL): FALSE - nameunit (STRING): [pcm0] - devnode (STRING): [dsp0] - desc (STRING): [Generic (0x8086) (Analog Line-out)] - pchan (NUMBER): 1 (1) (0x1) - rchan (NUMBER): 0 (0) (0x0) - info_play (NVLIST): - min_rate (NUMBER): 48000 (48000) (0xbb80) - max_rate (NUMBER): 48000 (48000) (0xbb80) - formats (NUMBER): 16 (16) (0x10) - min_chn (NUMBER): 2 (2) (0x2) - max_chn (NUMBER): 2 (2) (0x2) - provider_info (NVLIST): - unit (NUMBER): 0 (0) (0x0) - bitperfect (BOOL): FALSE - pvchan (NUMBER): 1 (1) (0x1) - rvchan (NUMBER): 0 (0) (0x0) - provider (STRING): [sound(4)] - , + from_user (BOOL): FALSE + nameunit (STRING): [pcm0] + devnode (STRING): [dsp0] + desc (STRING): [Generic (0x8086) (Analog Line-out)] + pchan (NUMBER): 1 + rchan (NUMBER): 0 + info_play (NVLIST): + min_rate (NUMBER): 48000 + max_rate (NUMBER): 48000 + formats (NUMBER): 16 + min_chn (NUMBER): 2 + max_chn (NUMBER): 2 + provider_info (NVLIST): + unit (NUMBER): 0 + bitperfect (BOOL): FALSE + pvchan (NUMBER): 1 + rvchan (NUMBER): 0 + channel_info (NVLIST_ARRAY): 1 + name (STRING): pcm0:virtual_play:dsp0.vp0 + 
parentchan (STRING): pcm0:play:dsp0.p0 + unit (NUMBER): 1 + latency (NUMBER): 2 + rate (NUMBER): 48000 + format (NUMBER): 0x201000 + pid (NUMBER): 1234 + comm (STRING): mpv + interrupts (NUMBER): 0 + feedcount (NUMBER): 0 + xruns (NUMBER): 0 + left_volume (NUMBER): 45 + right_volume (NUMBER): 45 + hwbuf_format (NUMBER): 0x200010 + hwbuf_size (NUMBER): 0 + hwbuf_blksz (NUMBER): 0 + hwbuf_blkcnt (NUMBER): 0 + hwbuf_free (NUMBER): 0 + hwbuf_ready (NUMBER): 0 + swbuf_format (NUMBER): 0x201000 + swbuf_size (NUMBER): 16384 + swbuf_blksz (NUMBER): 2048 + swbuf_blkcnt (NUMBER): 8 + swbuf_free (NUMBER): 16384 + swbuf_ready (NUMBER): 0 + feederchain (STRING): + [userland -> + feeder_root(0x00201000) -> + feeder_format(0x00201000 -> 0x00200010) -> + feeder_volume(0x00200010) -> hardware] + provider (STRING): [sound(4)] .Ed .Bl -tag -width ".Dv provider_info" .It Dv from_user @@ -133,6 +163,76 @@ Provider-specific fields. This field may not exist if the PCM audio device is not provided by in-kernel interface. This field will not exist if the provider field is an empty string. +For the +.Xr sound 4 +provider, there are a number of name/value pairs inside this field: +.Bl -tag -width ".Dv channel_info" +.It Dv unit +Sound card unit. +.It Dv bitperfect +Whether the sound card has bit-perfect mode enabled. +.It Dv pvchan +Number of playback virtual channels. +.It Dv rvchan +Number of recording virtual channels. +.It Dv channel_info +Channel information. +There are a number of name/value pairs inside this field: +.Bl -tag -width ".Dv hwbuf_blkcnt" +.It Dv name +Channel name. +.It Dv parentchan +Parent channel name (e.g., in the case of virtual channels). +.It Dv unit +Channel unit. +.It Dv latency +Latency. +.It Dv rate +Sampling rate. +.It Dv format +Sampling format. +.It Dv pid +PID of the process consuming the channel. +.It Dv comm +Name of the process consuming the channel. +.It Dv interrupts +Number of interrupts since the channel has been opened.
+.It Dv xruns +Number of overruns/underruns, depending on channel direction. +.It Dv feedcount +Number of read/written bytes since the channel has been opened. +.It Dv left_volume +Left volume. +.It Dv right_volume +Right volume. +.It Dv hwbuf_format +Hardware buffer format. +.It Dv hwbuf_size +Hardware buffer size. +.It Dv hwbuf_blksz +Hardware buffer block size. +.It Dv hwbuf_blkcnt +Hardware buffer block count. +.It Dv hwbuf_free +Free space in hardware buffer (in bytes). +.It Dv hwbuf_ready +Number of bytes ready to be read/written from hardware buffer. +.It Dv swbuf_format +Software buffer format. +.It Dv swbuf_size +Software buffer size. +.It Dv swbuf_blksz +Software buffer block size. +.It Dv swbuf_blkcnt +Software buffer block count. +.It Dv swbuf_free +Free space in software buffer (in bytes). +.It Dv swbuf_ready +Number of bytes ready to be read/written from software buffer. +.It Dv feederchain +Channel feeder chain. +.El +.El .It Dv provider A string specifying the provider of the PCm audio device. 
.El diff --git a/sys/dev/sound/pcm/sndstat.c b/sys/dev/sound/pcm/sndstat.c index 6670a1e43aac..3be376e1da01 100644 --- a/sys/dev/sound/pcm/sndstat.c +++ b/sys/dev/sound/pcm/sndstat.c @@ -392,9 +392,12 @@ sndstat_create_diinfo_nv(uint32_t min_rate, uint32_t max_rate, uint32_t formats, static int sndstat_build_sound4_nvlist(struct snddev_info *d, nvlist_t **dip) { + struct pcm_channel *c; + struct pcm_feeder *f; + struct sbuf sb; uint32_t maxrate, minrate, fmts, minchn, maxchn; - nvlist_t *di = NULL, *sound4di = NULL, *diinfo = NULL; - int err; + nvlist_t *di = NULL, *sound4di = NULL, *diinfo = NULL, *cdi = NULL; + int err, nchan; di = nvlist_create(0); if (di == NULL) { @@ -451,8 +454,116 @@ sndstat_build_sound4_nvlist(struct snddev_info *d, nvlist_t **dip) sound4di, SNDST_DSPS_SOUND4_BITPERFECT, d->flags & SD_F_BITPERFECT); nvlist_add_number(sound4di, SNDST_DSPS_SOUND4_PVCHAN, d->pvchancount); nvlist_add_number(sound4di, SNDST_DSPS_SOUND4_RVCHAN, d->rvchancount); + + nchan = 0; + CHN_FOREACH(c, d, channels.pcm) { + sbuf_new(&sb, NULL, 4096, SBUF_AUTOEXTEND); + cdi = nvlist_create(0); + if (cdi == NULL) { + sbuf_delete(&sb); + PCM_RELEASE_QUICK(d); + err = ENOMEM; + goto done; + } + + nvlist_add_string(cdi, SNDST_DSPS_SOUND4_CHAN_NAME, c->name); + nvlist_add_string(cdi, SNDST_DSPS_SOUND4_CHAN_PARENTCHAN, + c->parentchannel != NULL ? 
c->parentchannel->name : ""); + nvlist_add_number(cdi, SNDST_DSPS_SOUND4_CHAN_UNIT, nchan++); + nvlist_add_number(cdi, SNDST_DSPS_SOUND4_CHAN_LATENCY, + c->latency); + nvlist_add_number(cdi, SNDST_DSPS_SOUND4_CHAN_RATE, c->speed); + nvlist_add_number(cdi, SNDST_DSPS_SOUND4_CHAN_FORMAT, + c->format); + nvlist_add_number(cdi, SNDST_DSPS_SOUND4_CHAN_PID, c->pid); + nvlist_add_string(cdi, SNDST_DSPS_SOUND4_CHAN_COMM, c->comm); + nvlist_add_number(cdi, SNDST_DSPS_SOUND4_CHAN_INTR, + c->interrupts); + nvlist_add_number(cdi, SNDST_DSPS_SOUND4_CHAN_FEEDCNT, + c->feedcount); + nvlist_add_number(cdi, SNDST_DSPS_SOUND4_CHAN_XRUNS, c->xruns); + nvlist_add_number(cdi, SNDST_DSPS_SOUND4_CHAN_LEFTVOL, + CHN_GETVOLUME(c, SND_VOL_C_PCM, SND_CHN_T_FL)); + nvlist_add_number(cdi, SNDST_DSPS_SOUND4_CHAN_RIGHTVOL, + CHN_GETVOLUME(c, SND_VOL_C_PCM, SND_CHN_T_FR)); + nvlist_add_number(cdi, SNDST_DSPS_SOUND4_CHAN_HWBUF_FORMAT, + sndbuf_getfmt(c->bufhard)); + nvlist_add_number(cdi, SNDST_DSPS_SOUND4_CHAN_HWBUF_SIZE, + sndbuf_getsize(c->bufhard)); + nvlist_add_number(cdi, SNDST_DSPS_SOUND4_CHAN_HWBUF_BLKSZ, + sndbuf_getblksz(c->bufhard)); + nvlist_add_number(cdi, SNDST_DSPS_SOUND4_CHAN_HWBUF_BLKCNT, + sndbuf_getblkcnt(c->bufhard)); + nvlist_add_number(cdi, SNDST_DSPS_SOUND4_CHAN_HWBUF_FREE, + sndbuf_getfree(c->bufhard)); + nvlist_add_number(cdi, SNDST_DSPS_SOUND4_CHAN_HWBUF_READY, + sndbuf_getready(c->bufhard)); + nvlist_add_number(cdi, SNDST_DSPS_SOUND4_CHAN_SWBUF_FORMAT, + sndbuf_getfmt(c->bufsoft)); + nvlist_add_number(cdi, SNDST_DSPS_SOUND4_CHAN_SWBUF_SIZE, + sndbuf_getsize(c->bufsoft)); + nvlist_add_number(cdi, SNDST_DSPS_SOUND4_CHAN_SWBUF_BLKSZ, + sndbuf_getblksz(c->bufsoft)); + nvlist_add_number(cdi, SNDST_DSPS_SOUND4_CHAN_SWBUF_BLKCNT, + sndbuf_getblkcnt(c->bufsoft)); + nvlist_add_number(cdi, SNDST_DSPS_SOUND4_CHAN_SWBUF_FREE, + sndbuf_getfree(c->bufsoft)); + nvlist_add_number(cdi, SNDST_DSPS_SOUND4_CHAN_SWBUF_READY, + sndbuf_getready(c->bufsoft)); + + sbuf_printf(&sb, "[%s", + 
(c->direction == PCMDIR_REC) ? "hardware" : "userland"); + sbuf_printf(&sb, " -> "); + f = c->feeder; + while (f->source != NULL) + f = f->source; + while (f != NULL) { + sbuf_printf(&sb, "%s", f->class->name); + if (f->desc->type == FEEDER_FORMAT) { + sbuf_printf(&sb, "(0x%08x -> 0x%08x)", + f->desc->in, f->desc->out); + } else if (f->desc->type == FEEDER_MATRIX) { + sbuf_printf(&sb, "(%d.%d -> %d.%d)", + AFMT_CHANNEL(f->desc->in) - + AFMT_EXTCHANNEL(f->desc->in), + AFMT_EXTCHANNEL(f->desc->in), + AFMT_CHANNEL(f->desc->out) - + AFMT_EXTCHANNEL(f->desc->out), + AFMT_EXTCHANNEL(f->desc->out)); + } else if (f->desc->type == FEEDER_RATE) { + sbuf_printf(&sb, + "(0x%08x q:%d %d -> %d)", + f->desc->out, + FEEDER_GET(f, FEEDRATE_QUALITY), + FEEDER_GET(f, FEEDRATE_SRC), + FEEDER_GET(f, FEEDRATE_DST)); + } else { + sbuf_printf(&sb, "(0x%08x)", + f->desc->out); + } + sbuf_printf(&sb, " -> "); + f = f->parent; + } + sbuf_printf(&sb, "%s]", + (c->direction == PCMDIR_REC) ? "userland" : "hardware"); + + sbuf_finish(&sb); + nvlist_add_string(cdi, SNDST_DSPS_SOUND4_CHAN_FEEDERCHAIN, + sbuf_data(&sb)); + sbuf_delete(&sb); + + nvlist_append_nvlist_array(sound4di, + SNDST_DSPS_SOUND4_CHAN_INFO, cdi); + nvlist_destroy(cdi); + err = nvlist_error(sound4di); + if (err) { + PCM_RELEASE_QUICK(d); + goto done; + } + } nvlist_move_nvlist(di, SNDST_DSPS_PROVIDER_INFO, sound4di); sound4di = NULL; + PCM_RELEASE_QUICK(d); nvlist_add_string(di, SNDST_DSPS_PROVIDER, SNDST_DSPS_SOUND4_PROVIDER); diff --git a/sys/sys/sndstat.h b/sys/sys/sndstat.h index e0e403b1a72a..6fef6502ec89 100644 --- a/sys/sys/sndstat.h +++ b/sys/sys/sndstat.h @@ -68,11 +68,38 @@ struct sndstioc_nv_arg { /* * sound(4)-specific name/value pair names */ -#define SNDST_DSPS_SOUND4_PROVIDER "sound(4)" -#define SNDST_DSPS_SOUND4_UNIT "unit" -#define SNDST_DSPS_SOUND4_BITPERFECT "bitperfect" -#define SNDST_DSPS_SOUND4_PVCHAN "pvchan" -#define SNDST_DSPS_SOUND4_RVCHAN "rvchan" +#define SNDST_DSPS_SOUND4_PROVIDER "sound(4)" +#define 
SNDST_DSPS_SOUND4_UNIT "unit" +#define SNDST_DSPS_SOUND4_BITPERFECT "bitperfect" +#define SNDST_DSPS_SOUND4_PVCHAN "pvchan" +#define SNDST_DSPS_SOUND4_RVCHAN "rvchan" +#define SNDST_DSPS_SOUND4_CHAN_INFO "channel_info" +#define SNDST_DSPS_SOUND4_CHAN_NAME "name" +#define SNDST_DSPS_SOUND4_CHAN_PARENTCHAN "parentchan" +#define SNDST_DSPS_SOUND4_CHAN_UNIT "unit" +#define SNDST_DSPS_SOUND4_CHAN_LATENCY "latency" +#define SNDST_DSPS_SOUND4_CHAN_RATE "rate" +#define SNDST_DSPS_SOUND4_CHAN_FORMAT "format" +#define SNDST_DSPS_SOUND4_CHAN_PID "pid" +#define SNDST_DSPS_SOUND4_CHAN_COMM "comm" +#define SNDST_DSPS_SOUND4_CHAN_INTR "interrupts" +#define SNDST_DSPS_SOUND4_CHAN_FEEDCNT "feedcount" +#define SNDST_DSPS_SOUND4_CHAN_XRUNS "xruns" +#define SNDST_DSPS_SOUND4_CHAN_LEFTVOL "left_volume" +#define SNDST_DSPS_SOUND4_CHAN_RIGHTVOL "right_volume" +#define SNDST_DSPS_SOUND4_CHAN_HWBUF_FORMAT "hwbuf_format" +#define SNDST_DSPS_SOUND4_CHAN_HWBUF_SIZE "hwbuf_size" +#define SNDST_DSPS_SOUND4_CHAN_HWBUF_BLKSZ "hwbuf_blksz" +#define SNDST_DSPS_SOUND4_CHAN_HWBUF_BLKCNT "hwbuf_blkcnt" +#define SNDST_DSPS_SOUND4_CHAN_HWBUF_FREE "hwbuf_free" +#define SNDST_DSPS_SOUND4_CHAN_HWBUF_READY "hwbuf_ready" +#define SNDST_DSPS_SOUND4_CHAN_SWBUF_FORMAT "swbuf_format" +#define SNDST_DSPS_SOUND4_CHAN_SWBUF_SIZE "swbuf_size" +#define SNDST_DSPS_SOUND4_CHAN_SWBUF_BLKSZ "swbuf_blksz" +#define SNDST_DSPS_SOUND4_CHAN_SWBUF_BLKCNT "swbuf_blkcnt" +#define SNDST_DSPS_SOUND4_CHAN_SWBUF_FREE "swbuf_free" +#define SNDST_DSPS_SOUND4_CHAN_SWBUF_READY "swbuf_ready" +#define SNDST_DSPS_SOUND4_CHAN_FEEDERCHAIN "feederchain" /* * Maximum user-specified nvlist buffer size From 6c5ee6e55a35421248f914c4b00dc9ed555262fe Mon Sep 17 00:00:00 2001 From: Getz Mikalsen Date: Fri, 7 Jun 2024 13:36:57 +0200 Subject: [PATCH 09/91] simd(7): add missing aarch64 SIMD functions Some of the string functions in contrib/arm-optimized routines are SIMD enhanced which is not reflect on the simd(7) manpage. 
This commit addresses that and introduces a new label A for functions enhanced with the Arm ASIMD (Neon) extension. Approved by: emaste Reviewed by: fuz (GSoC mentor) MFC to: stable/14 Sponsored by: Google LLC (GSoC 2024) Differential Revision: https://reviews.freebsd.org/D45523 --- share/man/man7/simd.7 | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/share/man/man7/simd.7 b/share/man/man7/simd.7 index fd9485524aef..877bc77adf4b 100644 --- a/share/man/man7/simd.7 +++ b/share/man/man7/simd.7 @@ -24,7 +24,7 @@ .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE . -.Dd December 6, 2023 +.Dd June 7, 2024 .Dt SIMD 7 .Os .Sh NAME @@ -55,33 +55,33 @@ Enhanced functions are present for the following architectures: .It bcopy Ta Ta S Ta S Ta S Ta SV .It bzero Ta Ta S Ta S Ta S .It div Ta Ta Ta S Ta S -.It index Ta S Ta Ta S1 +.It index Ta A Ta Ta S1 .It ldiv Ta Ta Ta S Ta S .It lldiv Ta Ta Ta S -.It memchr Ta S Ta Ta S1 -.It memcmp Ta S Ta S Ta S1 Ta S +.It memchr Ta A Ta Ta S1 +.It memcmp Ta A Ta S Ta S1 Ta S .It memccpy Ta Ta Ta S1 .It memcpy Ta S Ta S Ta S Ta S Ta SV .It memmove Ta S Ta S Ta S Ta S Ta SV -.It memrchr Ta Ta Ta S1 -.It memset Ta S Ta S Ta S Ta S -.It rindex Ta S Ta Ta S1 Ta S -.It stpcpy Ta S Ta Ta S1 +.It memrchr Ta A Ta Ta S1 +.It memset Ta A Ta S Ta S Ta S +.It rindex Ta A Ta Ta S1 Ta S +.It stpcpy Ta A Ta Ta S1 .It stpncpy Ta Ta Ta S1 .It strcat Ta Ta Ta S1 Ta S -.It strchr Ta S Ta Ta S1 Ta S -.It strchrnul Ta S Ta Ta S1 +.It strchr Ta A Ta Ta S1 Ta S +.It strchrnul Ta A Ta Ta S1 .It strcmp Ta S Ta S Ta S1 Ta S -.It strcpy Ta S Ta Ta S1 Ta S Ta S2 +.It strcpy Ta A Ta Ta S1 Ta S Ta S2 .It strcspn Ta Ta Ta S2 .It strlcat Ta Ta Ta S1 .It strlcpy Ta Ta Ta S1 -.It strlen Ta S Ta S Ta S1 +.It strlen Ta A Ta S Ta S1 .It strncat Ta Ta Ta S1 .It strncmp Ta S Ta S Ta S1 Ta S .It strncpy Ta Ta Ta S1 Ta Ta S2 -.It strnlen Ta S Ta Ta S1 -.It strrchr Ta S Ta Ta S1 Ta S +.It 
strnlen Ta A Ta Ta S1 +.It strrchr Ta A Ta Ta S1 Ta S .It strpbrk Ta Ta Ta S2 .It strsep Ta Ta Ta S2 .It strspn Ta Ta Ta S2 @@ -100,7 +100,8 @@ Enhanced functions are present for the following architectures: or PowerPC\ 2.05, .Sy 3 Ns :\ x86-64-v3, .Sy 4 Ns :\ x86-64-v4, -.Sy V Ns :\ PowerPC\ VSX. +.Sy V Ns :\ PowerPC\ VSX, +.Sy A Ns :\ Arm\ ASIMD (NEON). . .Sh ENVIRONMENT .Bl -tag From e335a74b5671fa31ea347e70f43004ec29b35280 Mon Sep 17 00:00:00 2001 From: Benedict Reuschling Date: Mon, 10 Jun 2024 08:13:58 +0000 Subject: [PATCH 10/91] accf_http.9: Fix grammar in description (singular/plural) --- share/man/man9/accf_http.9 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/share/man/man9/accf_http.9 b/share/man/man9/accf_http.9 index f48894807312..f71bb7811dc0 100644 --- a/share/man/man9/accf_http.9 +++ b/share/man/man9/accf_http.9 @@ -28,7 +28,7 @@ .Os .Sh NAME .Nm accf_http -.Nd "buffer incoming connections until a certain complete HTTP requests arrive" +.Nd "buffer incoming connections until a certain complete HTTP request arrives" .Sh SYNOPSIS .Nm options INET .Nm options ACCEPT_FILTER_HTTP From e5a54f19be8e0e4e700aaf46ab70d42d3ffba469 Mon Sep 17 00:00:00 2001 From: Benedict Reuschling Date: Mon, 10 Jun 2024 08:29:26 +0000 Subject: [PATCH 11/91] accf_tls(9): Fix grammar in description (singular/plural) --- share/man/man9/accf_tls.9 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/share/man/man9/accf_tls.9 b/share/man/man9/accf_tls.9 index 331ea2aa4fb8..d4dbc299e5bb 100644 --- a/share/man/man9/accf_tls.9 +++ b/share/man/man9/accf_tls.9 @@ -26,7 +26,7 @@ .Os .Sh NAME .Nm accf_tls -.Nd "buffer incoming connections until a TLS handshake like requests arrive" +.Nd "buffer incoming connections until a TLS handshake like request arrives" .Sh SYNOPSIS .Nm options INET .Nm options ACCEPT_FILTER_TLS From 3ff0dc1af85e253b83127ea2417a22a7b2c31f27 Mon Sep 17 00:00:00 2001 From: Kristof Provost Date: Mon, 10 Jun 2024 10:47:38 +0200 Subject: 
[PATCH 12/91] vmxnet3: make descriptor count checks more robust When we update credits there is a potential for a race causing an overflow of vxcr_next (i.e. incrementing it past vxcr_ndesc). Change the check to >= rather than == to be more robust against this. Reviewed by: emaste Sponsored by: Rubicon Communications, LLC ("Netgate") Differential Revision: https://reviews.freebsd.org/D43712 --- sys/dev/vmware/vmxnet3/if_vmx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sys/dev/vmware/vmxnet3/if_vmx.c b/sys/dev/vmware/vmxnet3/if_vmx.c index fdcad0dd4bba..62b5f313a137 100644 --- a/sys/dev/vmware/vmxnet3/if_vmx.c +++ b/sys/dev/vmware/vmxnet3/if_vmx.c @@ -1429,7 +1429,8 @@ vmxnet3_isc_txd_credits_update(void *vsc, uint16_t txqid, bool clear) return (1); vmxnet3_barrier(sc, VMXNET3_BARRIER_RD); - if (++txc->vxcr_next == txc->vxcr_ndesc) { + MPASS(txc->vxcr_next < txc->vxcr_ndesc); + if (++txc->vxcr_next >= txc->vxcr_ndesc) { txc->vxcr_next = 0; txc->vxcr_gen ^= 1; } From fd911ae609247ef5c91493fb5506e77aa6e497bc Mon Sep 17 00:00:00 2001 From: Wei Hu Date: Mon, 10 Jun 2024 12:33:46 +0000 Subject: [PATCH 13/91] Hyper-V: remove unused alloc_pcpu_ptr() Fixes: 2b887687edc25bb4553f0d8a1183f454a85d413d Sponsored by: Microsoft --- sys/dev/hyperv/vmbus/vmbus.c | 1 - 1 file changed, 1 deletion(-) diff --git a/sys/dev/hyperv/vmbus/vmbus.c b/sys/dev/hyperv/vmbus/vmbus.c index a3daed05c21a..c1fa9107d3c2 100644 --- a/sys/dev/hyperv/vmbus/vmbus.c +++ b/sys/dev/hyperv/vmbus/vmbus.c @@ -140,7 +140,6 @@ static void vmbus_event_proc_dummy(struct vmbus_softc *, static bus_dma_tag_t vmbus_get_dma_tag(device_t parent, device_t child); static struct vmbus_softc *vmbus_sc; static void free_pcpu_ptr(void); -static void alloc_pcpu_ptr(void); SYSCTL_NODE(_hw, OID_AUTO, vmbus, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Hyper-V vmbus"); From a87651e2ff189cec4c26cb220354f1bc93794f31 Mon Sep 17 00:00:00 2001 From: Ed Maste Date: Mon, 10 Jun 2024 08:51:01 -0400 Subject: [PATCH 14/91] 
ktrace(1): add xref to namei(9) namei was mistaken for a typo (see GitHub pull request #1284). Add an xref to make it clear. Reviewed by: imp Sponsored by: The FreeBSD Foundation Differential Revision: https://reviews.freebsd.org/D45546 --- usr.bin/ktrace/ktrace.1 | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/usr.bin/ktrace/ktrace.1 b/usr.bin/ktrace/ktrace.1 index da39aac7800f..162706c65e6e 100644 --- a/usr.bin/ktrace/ktrace.1 +++ b/usr.bin/ktrace/ktrace.1 @@ -25,7 +25,7 @@ .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" -.Dd April 20, 2024 +.Dd June 10, 2024 .Dt KTRACE 1 .Os .Sh NAME @@ -48,7 +48,8 @@ The utility enables kernel trace logging for the specified processes. Kernel trace data is logged to the file .Pa ktrace.out . -The kernel operations that are traced include system calls, namei +The kernel operations that are traced include system calls, +.Xr namei 9 translations, signal processing, and .Tn I/O . .Pp @@ -199,7 +200,8 @@ Disable tracing of all user-owned processes: .Xr truss 1 , .Xr ktrace 2 , .Xr utrace 2 , -.Xr capsicum 4 +.Xr capsicum 4 , +.Xr namei 9 .Sh HISTORY The .Nm From 87e5b17c149ef7e68c23c7dfd18734b3e0abe48d Mon Sep 17 00:00:00 2001 From: Ed Maste Date: Sun, 9 Jun 2024 13:24:13 -0400 Subject: [PATCH 15/91] prepare-commit-msg: add 50 column marker A common convention suggests a 50 column commit subject line (the first line of the commit message). 
Add a 50 column marker to the commit message template Reviewed by: lwhsu Sponsored by: The FreeBSD Foundation Differential Revision: https://reviews.freebsd.org/D45538 --- tools/tools/git/hooks/prepare-commit-msg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/tools/git/hooks/prepare-commit-msg b/tools/tools/git/hooks/prepare-commit-msg index ac3844accec2..e8e0e2887c3f 100755 --- a/tools/tools/git/hooks/prepare-commit-msg +++ b/tools/tools/git/hooks/prepare-commit-msg @@ -38,7 +38,7 @@ outfile=$(mktemp /tmp/freebsd-git-commit.XXXXXXXX) cat >$outfile < Date: Mon, 10 Jun 2024 15:58:15 +0100 Subject: [PATCH 16/91] vmm: Only link the arm64 hyp code in vmm.ko once This code runs at EL2 while the kernel runs at EL1. We build these files for EL2 through a dependency in vmm_hyp_blob.elf.full so there is no need to include them in SRCS. Reviewed by: imp, kib, markj Sponsored by: Arm Ltd Differential Revision: https://reviews.freebsd.org/D45467 --- sys/modules/vmm/Makefile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sys/modules/vmm/Makefile b/sys/modules/vmm/Makefile index 6737d868f2ea..8f6eb915290a 100644 --- a/sys/modules/vmm/Makefile +++ b/sys/modules/vmm/Makefile @@ -34,8 +34,7 @@ SRCS+= vgic.c \ vgic_v3.c \ vtimer.c -SRCS+= vmm_hyp_exception.S vmm_hyp.c - +CLEANFILES+= vmm_hyp_exception.o vmm_hyp.o CLEANFILES+= vmm_hyp_blob.elf.full CLEANFILES+= vmm_hyp_blob.elf vmm_hyp_blob.bin From 86bafddd61aba115bc46bcf1d7e0afb125850b5f Mon Sep 17 00:00:00 2001 From: Andrew Turner Date: Mon, 10 Jun 2024 15:58:22 +0100 Subject: [PATCH 17/91] arm64: Fix indentation to be consistent Adjust the mair_el1 macro indentation to be consistent with the surrounding macros. 
Reviewed by: emaste Sponsored by: Arm Ltd Differential Revision: https://reviews.freebsd.org/D45524 --- sys/arm64/include/armreg.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sys/arm64/include/armreg.h b/sys/arm64/include/armreg.h index 7bdb28aad1e5..194731c98902 100644 --- a/sys/arm64/include/armreg.h +++ b/sys/arm64/include/armreg.h @@ -1505,13 +1505,13 @@ #define ID_ISAR5_VCMA_IMPL (UL(0x1) << ID_ISAR5_VCMA_SHIFT) /* MAIR_EL1 - Memory Attribute Indirection Register */ -#define MAIR_ATTR_MASK(idx) (UL(0xff) << ((n)* 8)) -#define MAIR_ATTR(attr, idx) ((attr) << ((idx) * 8)) -#define MAIR_DEVICE_nGnRnE UL(0x00) -#define MAIR_DEVICE_nGnRE UL(0x04) -#define MAIR_NORMAL_NC UL(0x44) -#define MAIR_NORMAL_WT UL(0xbb) -#define MAIR_NORMAL_WB UL(0xff) +#define MAIR_ATTR_MASK(idx) (UL(0xff) << ((n)* 8)) +#define MAIR_ATTR(attr, idx) ((attr) << ((idx) * 8)) +#define MAIR_DEVICE_nGnRnE UL(0x00) +#define MAIR_DEVICE_nGnRE UL(0x04) +#define MAIR_NORMAL_NC UL(0x44) +#define MAIR_NORMAL_WT UL(0xbb) +#define MAIR_NORMAL_WB UL(0xff) /* MDCCINT_EL1 */ #define MDCCINT_EL1 MRS_REG(MDCCINT_EL1) From 6abad5b6062d9e8f7e6dd08eaa4033a5358b952e Mon Sep 17 00:00:00 2001 From: Andrew Turner Date: Mon, 10 Jun 2024 15:58:47 +0100 Subject: [PATCH 18/91] Remove the arm FIQ support It isn't used, and only masks/unmasks FIQs on the local CPU so will be broken on SMP. 
Reviewed by: mmel Differential Revision: https://reviews.freebsd.org/D33804 --- sys/arm/arm/autoconf.c | 2 +- sys/arm/arm/exception.S | 37 +-------- sys/arm/arm/fiq.c | 163 ---------------------------------------- sys/arm/arm/fiq_subr.S | 92 ----------------------- sys/arm/arm/machdep.c | 2 +- sys/arm/arm/pmap-v6.c | 4 +- sys/arm/arm/syscall.c | 2 - sys/arm/arm/trap-v6.c | 4 +- sys/arm/arm/undefined.c | 2 - sys/arm/include/cpu.h | 2 +- sys/arm/include/fiq.h | 71 ----------------- sys/conf/files.arm | 2 - 12 files changed, 9 insertions(+), 374 deletions(-) delete mode 100644 sys/arm/arm/fiq.c delete mode 100644 sys/arm/arm/fiq_subr.S delete mode 100644 sys/arm/include/fiq.h diff --git a/sys/arm/arm/autoconf.c b/sys/arm/arm/autoconf.c index 23289db9c19a..fe2882dc6d9f 100644 --- a/sys/arm/arm/autoconf.c +++ b/sys/arm/arm/autoconf.c @@ -90,7 +90,7 @@ static void configure_final(void *dummy) { - enable_interrupts(PSR_I | PSR_F); + enable_interrupts(PSR_I); cninit_finish(); cold = 0; } diff --git a/sys/arm/arm/exception.S b/sys/arm/arm/exception.S index d2e55bd7d84d..d0be3bd9c90a 100644 --- a/sys/arm/arm/exception.S +++ b/sys/arm/arm/exception.S @@ -165,12 +165,12 @@ _C_LABEL(dtrace_invop_jump_addr): #define DO_AST \ ldr r0, [sp]; /* Get the SPSR from stack */ \ mrs r4, cpsr; /* save CPSR */ \ - orr r1, r4, #(PSR_I|PSR_F); \ + orr r1, r4, #(PSR_I); \ msr cpsr_c, r1; /* Disable interrupts */ \ and r0, r0, #(PSR_MODE); /* Returning to USR mode? */ \ teq r0, #(PSR_USR32_MODE); \ bne 2f; /* Nope, get out now */ \ - bic r4, r4, #(PSR_I|PSR_F); \ + bic r4, r4, #(PSR_I); \ 1: GET_CURTHREAD_PTR(r5); \ ldr r1, [r5, #(TD_AST)]; \ teq r1, #0; \ @@ -178,7 +178,7 @@ _C_LABEL(dtrace_invop_jump_addr): msr cpsr_c, r4; /* Restore interrupts */ \ mov r0, sp; \ bl _C_LABEL(ast); /* ast(frame) */ \ - orr r0, r4, #(PSR_I|PSR_F); \ + orr r0, r4, #(PSR_I); \ msr cpsr_c, r0; \ b 1b; \ 2: @@ -315,21 +315,6 @@ ASENTRY_NP(irq_entry) b _C_LABEL(intr_irq_handler)/* trapframe to the handler. 
*/ END(irq_entry) -/* - * Entry point for an FIQ interrupt. - * - * We don't currently support FIQ handlers very much. Something can - * install itself in the FIQ vector using code (that may or may not work - * these days) in fiq.c. If nobody does that and an FIQ happens, this - * default handler just disables FIQs and otherwise ignores it. - */ -ASENTRY_NP(fiq_entry) - mrs r8, cpsr /* FIQ handling isn't supported, */ - bic r8, #(PSR_F) /* just disable FIQ and return. */ - msr cpsr_c, r8 /* The r8 we trash here is the */ - subs pc, lr, #4 /* banked FIQ-mode r8. */ -END(fiq_entry) - /* * Entry point for an Address Exception exception. * This is an arm26 exception that should never happen. @@ -378,8 +363,6 @@ _C_LABEL(page0): ldr pc, .Ldata_abort_entry ldr pc, .Laddr_exception_entry ldr pc, .Lirq_entry -.fiqv: ldr pc, .Lfiq_entry - .space 256 /* room for some fiq handler code */ _C_LABEL(page0_data): .Lreset_entry: .word reset_entry @@ -389,18 +372,4 @@ _C_LABEL(page0_data): .Ldata_abort_entry: .word data_abort_entry .Laddr_exception_entry: .word addr_exception_entry .Lirq_entry: .word irq_entry -.Lfiq_entry: .word fiq_entry - -/* - * These items are used by the code in fiq.c to install what it calls the - * "null" handler. It's actually our default vector entry that just jumps - * to the default handler which just disables FIQs and returns. - */ - .global _C_LABEL(fiq_nullhandler_code), _C_LABEL(fiq_nullhandler_size) - -_C_LABEL(fiq_nullhandler_code): - .word .fiqv -_C_LABEL(fiq_nullhandler_size): - .word 4 - diff --git a/sys/arm/arm/fiq.c b/sys/arm/arm/fiq.c deleted file mode 100644 index e517bbeaf01b..000000000000 --- a/sys/arm/arm/fiq.c +++ /dev/null @@ -1,163 +0,0 @@ -/* $NetBSD: fiq.c,v 1.5 2002/04/03 23:33:27 thorpej Exp $ */ - -/*- - * SPDX-License-Identifier: BSD-4-Clause - * - * Copyright (c) 2001, 2002 Wasabi Systems, Inc. - * All rights reserved. - * - * Written by Jason R. Thorpe for Wasabi Systems, Inc. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed for the NetBSD Project by - * Wasabi Systems, Inc. - * 4. The name of Wasabi Systems, Inc. may not be used to endorse - * or promote products derived from this software without specific prior - * written permission. - * - * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- */ - -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -TAILQ_HEAD(, fiqhandler) fiqhandler_stack = - TAILQ_HEAD_INITIALIZER(fiqhandler_stack); - -extern char *fiq_nullhandler_code; -extern uint32_t fiq_nullhandler_size; - -/* - * fiq_installhandler: - * - * Actually install the FIQ handler down at the FIQ vector. - * - * The FIQ vector is fixed by the hardware definition as the - * seventh 32-bit word in the vector page. - * - * Note: If the FIQ is invoked via an extra layer of - * indirection, the actual FIQ code store lives in the - * data segment, so there is no need to manipulate - * the vector page's protection. - */ -static void -fiq_installhandler(void *func, size_t size) -{ - const uint32_t fiqvector = 7 * sizeof(uint32_t); - - memcpy((void *)(vector_page + fiqvector), func, size); - icache_sync((vm_offset_t) fiqvector, size); -} - -/* - * fiq_claim: - * - * Claim the FIQ vector. - */ -int -fiq_claim(struct fiqhandler *fh) -{ - struct fiqhandler *ofh; - u_int oldirqstate; - int error = 0; - - if (fh->fh_size > 0x100) - return (EFBIG); - - oldirqstate = disable_interrupts(PSR_F); - - if ((ofh = TAILQ_FIRST(&fiqhandler_stack)) != NULL) { - if ((ofh->fh_flags & FH_CANPUSH) == 0) { - error = EBUSY; - goto out; - } - - /* Save the previous FIQ handler's registers. */ - if (ofh->fh_regs != NULL) - fiq_getregs(ofh->fh_regs); - } - - /* Set FIQ mode registers to ours. */ - if (fh->fh_regs != NULL) - fiq_setregs(fh->fh_regs); - - TAILQ_INSERT_HEAD(&fiqhandler_stack, fh, fh_list); - - /* Now copy the actual handler into place. */ - fiq_installhandler(fh->fh_func, fh->fh_size); - - /* Make sure FIQs are enabled when we return. */ - oldirqstate &= ~PSR_F; - - out: - restore_interrupts(oldirqstate); - return (error); -} - -/* - * fiq_release: - * - * Release the FIQ vector. 
- */ -void -fiq_release(struct fiqhandler *fh) -{ - u_int oldirqstate; - struct fiqhandler *ofh; - - oldirqstate = disable_interrupts(PSR_F); - - /* - * If we are the currently active FIQ handler, then we - * need to save our registers and pop the next one back - * into the vector. - */ - if (fh == TAILQ_FIRST(&fiqhandler_stack)) { - if (fh->fh_regs != NULL) - fiq_getregs(fh->fh_regs); - TAILQ_REMOVE(&fiqhandler_stack, fh, fh_list); - if ((ofh = TAILQ_FIRST(&fiqhandler_stack)) != NULL) { - if (ofh->fh_regs != NULL) - fiq_setregs(ofh->fh_regs); - fiq_installhandler(ofh->fh_func, ofh->fh_size); - } - } else - TAILQ_REMOVE(&fiqhandler_stack, fh, fh_list); - - if (TAILQ_FIRST(&fiqhandler_stack) == NULL) { - /* Copy the NULL handler back down into the vector. */ - fiq_installhandler(fiq_nullhandler_code, fiq_nullhandler_size); - - /* Make sure FIQs are disabled when we return. */ - oldirqstate |= PSR_F; - } - - restore_interrupts(oldirqstate); -} diff --git a/sys/arm/arm/fiq_subr.S b/sys/arm/arm/fiq_subr.S deleted file mode 100644 index e2fc793d2fd3..000000000000 --- a/sys/arm/arm/fiq_subr.S +++ /dev/null @@ -1,92 +0,0 @@ -/* $NetBSD: fiq_subr.S,v 1.3 2002/04/12 18:50:31 thorpej Exp $ */ - -/*- - * Copyright (c) 2001 Wasabi Systems, Inc. - * All rights reserved. - * - * Written by Jason R. Thorpe for Wasabi Systems, Inc. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. 
All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed for the NetBSD Project by - * Wasabi Systems, Inc. - * 4. The name of Wasabi Systems, Inc. may not be used to endorse - * or promote products derived from this software without specific prior - * written permission. - * - * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - * - */ - - -#include -#include - -/* - * MODE_CHANGE_NOP should be inserted between a mode change and a - * banked register (R8--R15) access. - */ -#if defined(CPU_ARM2) || defined(CPU_ARM250) -#define MODE_CHANGE_NOP mov r0, r0 -#else -#define MODE_CHANGE_NOP /* Data sheet says ARM3 doesn't need it */ -#endif - -#define SWITCH_TO_FIQ_MODE \ - mrs r2, cpsr ; \ - mov r3, r2 ; \ - bic r2, r2, #(PSR_MODE) ; \ - orr r2, r2, #(PSR_FIQ32_MODE) ; \ - msr cpsr_fsxc, r2 - -#define BACK_TO_SVC_MODE \ - msr cpsr_fsxc, r3 - -/* - * fiq_getregs: - * - * Fetch the FIQ mode banked registers into the fiqhandler - * structure. - */ -ENTRY(fiq_getregs) - SWITCH_TO_FIQ_MODE - - stmia r0, {r8-r13} - - BACK_TO_SVC_MODE - RET -END(fiq_getregs) - -/* - * fiq_setregs: - * - * Load the FIQ mode banked registers from the fiqhandler - * structure. 
- */ -ENTRY(fiq_setregs) - SWITCH_TO_FIQ_MODE - - ldmia r0, {r8-r13} - - BACK_TO_SVC_MODE - RET -END(fiq_setregs) - diff --git a/sys/arm/arm/machdep.c b/sys/arm/arm/machdep.c index f54113e3d0ca..fdaf5d3dade8 100644 --- a/sys/arm/arm/machdep.c +++ b/sys/arm/arm/machdep.c @@ -314,7 +314,7 @@ spinlock_enter(void) td = curthread; if (td->td_md.md_spinlock_count == 0) { - cspr = disable_interrupts(PSR_I | PSR_F); + cspr = disable_interrupts(PSR_I); td->td_md.md_spinlock_count = 1; td->td_md.md_saved_cspr = cspr; critical_enter(); diff --git a/sys/arm/arm/pmap-v6.c b/sys/arm/arm/pmap-v6.c index 8d7292f7aa02..de1082e7ae62 100644 --- a/sys/arm/arm/pmap-v6.c +++ b/sys/arm/arm/pmap-v6.c @@ -3364,7 +3364,7 @@ pmap_change_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va, * will be affected by the broken mapping, disable interrupts * until the mapping change is completed. */ - cspr = disable_interrupts(PSR_I | PSR_F); + cspr = disable_interrupts(PSR_I); pte1_clear(pte1p); pmap_tlb_flush_pte1(pmap, va, npte1); pte1_store(pte1p, npte1); @@ -3389,7 +3389,7 @@ pmap_change_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va, * mappings. It's absolutely safe in UP case when interrupts * are disabled. 
*/ - cspr = disable_interrupts(PSR_I | PSR_F); + cspr = disable_interrupts(PSR_I); pte1_clear(pte1p); pmap_tlb_flush_pte1(pmap, va, npte1); pte1_store(pte1p, npte1); diff --git a/sys/arm/arm/syscall.c b/sys/arm/arm/syscall.c index 73ac0560cd91..cce63cdadf32 100644 --- a/sys/arm/arm/syscall.c +++ b/sys/arm/arm/syscall.c @@ -160,8 +160,6 @@ swi_handler(struct trapframe *frame) if (td->td_md.md_spinlock_count == 0) { if (__predict_true(frame->tf_spsr & PSR_I) == 0) enable_interrupts(PSR_I); - if (__predict_true(frame->tf_spsr & PSR_F) == 0) - enable_interrupts(PSR_F); } syscall(td, frame); diff --git a/sys/arm/arm/trap-v6.c b/sys/arm/arm/trap-v6.c index 681f1f90eec6..d58f8c2e4a48 100644 --- a/sys/arm/arm/trap-v6.c +++ b/sys/arm/arm/trap-v6.c @@ -408,8 +408,6 @@ abort_handler(struct trapframe *tf, int prefetch) if (td->td_md.md_spinlock_count == 0) { if (__predict_true(tf->tf_spsr & PSR_I) == 0) enable_interrupts(PSR_I); - if (__predict_true(tf->tf_spsr & PSR_F) == 0) - enable_interrupts(PSR_F); } p = td->td_proc; @@ -565,7 +563,7 @@ abort_fatal(struct trapframe *tf, u_int idx, u_int fsr, u_int far, mode = usermode ? "user" : "kernel"; rw_mode = fsr & FSR_WNR ? "write" : "read"; - disable_interrupts(PSR_I|PSR_F); + disable_interrupts(PSR_I); if (td != NULL) { printf("Fatal %s mode data abort: '%s' on %s\n", mode, diff --git a/sys/arm/arm/undefined.c b/sys/arm/arm/undefined.c index dc29b55ae596..e882a911ba4a 100644 --- a/sys/arm/arm/undefined.c +++ b/sys/arm/arm/undefined.c @@ -226,8 +226,6 @@ undefinedinstruction(struct trapframe *frame) /* Enable interrupts if they were enabled before the exception. 
*/ if (__predict_true(frame->tf_spsr & PSR_I) == 0) enable_interrupts(PSR_I); - if (__predict_true(frame->tf_spsr & PSR_F) == 0) - enable_interrupts(PSR_F); VM_CNT_INC(v_trap); diff --git a/sys/arm/include/cpu.h b/sys/arm/include/cpu.h index b2c370d01cdc..75b0956d0620 100644 --- a/sys/arm/include/cpu.h +++ b/sys/arm/include/cpu.h @@ -729,7 +729,7 @@ get_cyclecount(void) #define cpu_spinwait() /* nothing */ #define cpu_lock_delay() DELAY(1) -#define ARM_NVEC 8 +#define ARM_NVEC 7 #define ARM_VEC_ALL 0xffffffff extern vm_offset_t vector_page; diff --git a/sys/arm/include/fiq.h b/sys/arm/include/fiq.h deleted file mode 100644 index f01011bb63ce..000000000000 --- a/sys/arm/include/fiq.h +++ /dev/null @@ -1,71 +0,0 @@ -/* $NetBSD: fiq.h,v 1.1 2001/12/20 01:20:23 thorpej Exp $ */ - -/*- - * SPDX-License-Identifier: BSD-4-Clause - * - * Copyright (c) 2001 Wasabi Systems, Inc. - * All rights reserved. - * - * Written by Jason R. Thorpe for Wasabi Systems, Inc. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed for the NetBSD Project by - * Wasabi Systems, Inc. - * 4. The name of Wasabi Systems, Inc. may not be used to endorse - * or promote products derived from this software without specific prior - * written permission. - * - * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. 
``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - * - */ - -#ifndef _MACHINE_FIQ_H_ -#define _MACHINE_FIQ_H_ - -#include - -struct fiqregs { - u_int fr_r8; /* FIQ mode r8 */ - u_int fr_r9; /* FIQ mode r9 */ - u_int fr_r10; /* FIQ mode r10 */ - u_int fr_r11; /* FIQ mode r11 */ - u_int fr_r12; /* FIQ mode r12 */ - u_int fr_r13; /* FIQ mode r13 */ -}; - -struct fiqhandler { - TAILQ_ENTRY(fiqhandler) fh_list;/* link in the FIQ handler stack */ - void *fh_func; /* FIQ handler routine */ - size_t fh_size; /* size of FIQ handler */ - int fh_flags; /* flags; see below */ - struct fiqregs *fh_regs; /* pointer to regs structure */ -}; - -#define FH_CANPUSH 0x01 /* can push this handler out of the way */ - -int fiq_claim(struct fiqhandler *); -void fiq_release(struct fiqhandler *); - -void fiq_getregs(struct fiqregs *); -void fiq_setregs(struct fiqregs *); - -#endif /* _MACHINE_FIQ_H_ */ diff --git a/sys/conf/files.arm b/sys/conf/files.arm index 110c0a9e9dbf..b049479fbe82 100644 --- a/sys/conf/files.arm +++ b/sys/conf/files.arm @@ -25,8 +25,6 @@ arm/arm/elf_machdep.c standard arm/arm/elf_note.S standard arm/arm/exception.S standard arm/arm/exec_machdep.c standard -arm/arm/fiq.c standard -arm/arm/fiq_subr.S standard arm/arm/fusu.S standard arm/arm/gdb_machdep.c optional gdb arm/arm/generic_timer.c optional generic_timer From 
0e34d099309687f19420c615c5c56c032f30119d Mon Sep 17 00:00:00 2001 From: Ed Maste Date: Mon, 10 Jun 2024 11:45:41 -0400 Subject: [PATCH 19/91] Add WITHOUT_LLVM_BINUTILS src.conf description Before FreeBSD 15 I plan to switch to LLVM_BINUTILS by default. Add the src.conf description now, for the benefit of downstream branches and testing/CI. PR: 258872 [exp-run] Sponsored by: The FreeBSD Foundation --- tools/build/options/WITHOUT_LLVM_BINUTILS | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 tools/build/options/WITHOUT_LLVM_BINUTILS diff --git a/tools/build/options/WITHOUT_LLVM_BINUTILS b/tools/build/options/WITHOUT_LLVM_BINUTILS new file mode 100644 index 000000000000..35cc0c0e034f --- /dev/null +++ b/tools/build/options/WITHOUT_LLVM_BINUTILS @@ -0,0 +1,17 @@ +Install ELF Tool Chain's binary utilities instead of LLVM's. +This includes +.Xr addr2line 1 , +.Xr ar 1 , +.Xr nm 1 , +.Xr objcopy 1 , +.Xr ranlib 1 , +.Xr readelf 1 , +.Xr size 1 , +and +.Xr strip 1 . +Regardless of this setting, LLVM tools are used for +.Xr c++filt 1 +and +.Xr objdump 1 . +.Xr strings 1 +is always provided by ELF Tool Chain. From 9747d11d91642cb9b81602d88e8aebeb388543c7 Mon Sep 17 00:00:00 2001 From: Alan Somers Date: Wed, 5 Jun 2024 14:16:25 -0600 Subject: [PATCH 20/91] Add some ATF tests for ctladm So far only "ctladm port -c" and "ctladm port -r" are covered. 
MFC after: 2 weeks Sponsored by: Axcient Reviewed by: mav Pull Request: https://github.com/freebsd/freebsd-src/pull/1279 --- etc/mtree/BSD.tests.dist | 2 + usr.sbin/ctladm/Makefile | 3 + usr.sbin/ctladm/tests/Makefile | 10 +++ usr.sbin/ctladm/tests/port.sh | 155 +++++++++++++++++++++++++++++++++ 4 files changed, 170 insertions(+) create mode 100644 usr.sbin/ctladm/tests/Makefile create mode 100644 usr.sbin/ctladm/tests/port.sh diff --git a/etc/mtree/BSD.tests.dist b/etc/mtree/BSD.tests.dist index 3f447f9ec25e..86db4304b932 100644 --- a/etc/mtree/BSD.tests.dist +++ b/etc/mtree/BSD.tests.dist @@ -90,6 +90,8 @@ .. .. usr.sbin + ctladm + .. dtrace common aggs diff --git a/usr.sbin/ctladm/Makefile b/usr.sbin/ctladm/Makefile index 5e0df8065cce..b563891672be 100644 --- a/usr.sbin/ctladm/Makefile +++ b/usr.sbin/ctladm/Makefile @@ -23,4 +23,7 @@ MAN= ctladm.8 CFLAGS+= -DWANT_ISCSI .endif +HAS_TESTS= +SUBDIR.${MK_TESTS}+= tests + .include diff --git a/usr.sbin/ctladm/tests/Makefile b/usr.sbin/ctladm/tests/Makefile new file mode 100644 index 000000000000..73ac94d77d21 --- /dev/null +++ b/usr.sbin/ctladm/tests/Makefile @@ -0,0 +1,10 @@ + +PACKAGE= tests + +ATF_TESTS_SH= port + +# "ctladm port" does not report the name of the port just created, so we can't +# cleanup unless we assume that no other test created a port too. +TEST_METADATA+= is_exclusive="true" + +.include diff --git a/usr.sbin/ctladm/tests/port.sh b/usr.sbin/ctladm/tests/port.sh new file mode 100644 index 000000000000..b2cdea6f1de2 --- /dev/null +++ b/usr.sbin/ctladm/tests/port.sh @@ -0,0 +1,155 @@ +# Things that aren't tested due to lack of kernel support: +# * Creating camsim ports +# * Creating tpc ports +# * Creating camtgt ports +# * Creating umass ports + +# TODO +# * Creating iscsi ports +# * Creating nvmf ports +# * Creating ha ports +# * Creating fc ports + +skip_if_ctld() { + if service ctld onestatus > /dev/null; then + # If ctld is running on this server, let's not interfere. 
+ atf_skip "Cannot run this test while ctld is running" + fi +} + +cleanup() { + driver=$1 + + if [ -e after-ports ]; then + diff before-ports after-ports | awk "/$driver/ {print \$2}" | xargs -n1 ctladm port -r -d ioctl -p + fi +} + +atf_test_case create_ioctl cleanup +create_ioctl_head() +{ + atf_set "descr" "ctladm can create a new ioctl port" + atf_set "require.user" "root" +} +create_ioctl_body() +{ + skip_if_ctld + + atf_check -o save:before-ports ctladm portlist -qf ioctl + atf_check ctladm port -c -d "ioctl" + atf_check -o save:after-ports ctladm portlist -qf ioctl + if test `wc -l before-ports | cut -w -f2` -ge `wc -l after-ports | cut -w -f2`; then + atf_fail "Did not create a new ioctl port" + fi +} +create_ioctl_cleanup() +{ + cleanup ioctl +} + +atf_test_case create_ioctl_options cleanup +create_ioctl_options_head() +{ + atf_set "descr" "ctladm can set options when creating a new ioctl port" + atf_set "require.user" "root" +} +create_ioctl_options_body() +{ + skip_if_ctld + + atf_check -o save:before-ports ctladm portlist -qf ioctl + atf_check ctladm port -c -d "ioctl" -O pp=101 -O vp=102 + atf_check -o save:after-ports ctladm portlist -qf ioctl + if test `wc -l before-ports | cut -w -f2` -ge `wc -l after-ports | cut -w -f2`; then + atf_fail "Did not create a new ioctl port" + fi + if ! 
egrep -q '101[[:space:]]+102' after-ports; then + ctladm portlist + atf_fail "Did not create the port with the specified options" + fi +} +create_ioctl_options_cleanup() +{ + cleanup ioctl +} + + +atf_test_case disable_ioctl cleanup +disable_ioctl_head() +{ + atf_set "descr" "ctladm can disable an ioctl port" + atf_set "require.user" "root" +} +disable_ioctl_body() +{ + skip_if_ctld + + atf_check -o save:before-ports ctladm portlist -qf ioctl + atf_check ctladm port -c -d "ioctl" + atf_check -o save:after-ports ctladm portlist -qf ioctl + if test `wc -l before-ports | cut -w -f2` -ge `wc -l after-ports | cut -w -f2`; then + atf_fail "Did not create a new ioctl port" + fi + portnum=`diff before-ports after-ports | awk '/ioctl/ {print $2}'`; + atf_check -o ignore ctladm port -o off -p $portnum + atf_check -o match:"^$portnum *NO" ctladm portlist -qf ioctl +} +disable_ioctl_cleanup() +{ + cleanup ioctl +} + +atf_test_case enable_ioctl cleanup +enable_ioctl_head() +{ + atf_set "descr" "ctladm can enable an ioctl port" + atf_set "require.user" "root" +} +enable_ioctl_body() +{ + skip_if_ctld + + atf_check -o save:before-ports ctladm portlist -qf ioctl + atf_check ctladm port -c -d "ioctl" + atf_check -o save:after-ports ctladm portlist -qf ioctl + if test `wc -l before-ports | cut -w -f2` -ge `wc -l after-ports | cut -w -f2`; then + atf_fail "Did not create a new ioctl port" + fi + portnum=`diff before-ports after-ports | awk '/ioctl/ {print $2}'`; + atf_check -o ignore ctladm port -o off -p $portnum + atf_check -o ignore ctladm port -o on -p $portnum + atf_check -o match:"^$portnum *YES" ctladm portlist -qf ioctl +} +enable_ioctl_cleanup() +{ + cleanup ioctl +} + +atf_test_case remove_ioctl +remove_ioctl_head() +{ + atf_set "descr" "ctladm can remove an ioctl port" + atf_set "require.user" "root" +} +remove_ioctl_body() +{ + skip_if_ctld + + atf_check -o save:before-ports ctladm portlist -qf ioctl + atf_check ctladm port -c -d "ioctl" + atf_check -o save:after-ports 
ctladm portlist -qf ioctl + if test `wc -l before-ports | cut -w -f2` -ge `wc -l after-ports | cut -w -f2`; then + atf_fail "Did not create a new ioctl port" + fi + portnum=`diff before-ports after-ports | awk '/ioctl/ {print $2}'`; + atf_check ctladm port -r -d ioctl -p $portnum +} + +atf_init_test_cases() +{ + atf_add_test_case create_ioctl + atf_add_test_case create_ioctl_options + atf_add_test_case disable_ioctl + atf_add_test_case enable_ioctl + atf_add_test_case remove_ioctl +} From 591de7534fb3acb2e6eef94a1e5e92000d2cf83d Mon Sep 17 00:00:00 2001 From: Alan Somers Date: Wed, 5 Jun 2024 17:54:46 -0600 Subject: [PATCH 21/91] ctladm: print port number with a succesful "port -c" command Make "ctladm port -c" print the port number of the newly successful port. This way it won't have to be guessed by a subsequent "ctladm portlist" command. That means it's safe to use it concurrently with other ctladm processes. In particular, this allows the tests to be run in parallel. MFC after: 2 weeks Sponsored by: Axcient Reviewed by: mav Pull Request: https://github.com/freebsd/freebsd-src/pull/1279 --- usr.sbin/ctladm/ctladm.c | 18 +++++++++ usr.sbin/ctladm/tests/Makefile | 4 -- usr.sbin/ctladm/tests/port.sh | 70 +++++++++++++++++----------------- 3 files changed, 52 insertions(+), 40 deletions(-) diff --git a/usr.sbin/ctladm/ctladm.c b/usr.sbin/ctladm/ctladm.c index 28f9a39386d3..14951797ddf1 100644 --- a/usr.sbin/ctladm/ctladm.c +++ b/usr.sbin/ctladm/ctladm.c @@ -397,7 +397,9 @@ static struct ctladm_opts cctl_fe_table[] = { static int cctl_port(int fd, int argc, char **argv, char *combinedopt) { + char result_buf[1024]; int c; + uint64_t created_port = -1; int32_t targ_port = -1; int retval = 0; int wwnn_set = 0, wwpn_set = 0; @@ -587,6 +589,8 @@ cctl_port(int fd, int argc, char **argv, char *combinedopt) case CCTL_PORT_MODE_CREATE: { bzero(&req, sizeof(req)); strlcpy(req.driver, driver, sizeof(req.driver)); + req.result = result_buf; + req.result_len = 
sizeof(result_buf); if (port_mode == CCTL_PORT_MODE_REMOVE) { req.reqtype = CTL_REQ_REMOVE; @@ -619,6 +623,20 @@ cctl_port(int fd, int argc, char **argv, char *combinedopt) warnx("warning: %s", req.error_str); break; case CTL_LUN_OK: + if (port_mode == CCTL_PORT_MODE_CREATE) { + req.result_nvl = nvlist_unpack(result_buf, req.result_len, 0); + if (req.result_nvl == NULL) { + warnx("error unpacking result nvlist"); + break; + } + created_port = nvlist_get_number(req.result_nvl, "port_id"); + printf("Port created successfully\n" + "frontend: %s\n" + "port: %ju\n", driver, + (uintmax_t) created_port); + nvlist_destroy(req.result_nvl); + } else + printf("Port destroyed successfully\n"); break; default: warnx("unknown status: %d", req.status); diff --git a/usr.sbin/ctladm/tests/Makefile b/usr.sbin/ctladm/tests/Makefile index 73ac94d77d21..825e38e6c6e3 100644 --- a/usr.sbin/ctladm/tests/Makefile +++ b/usr.sbin/ctladm/tests/Makefile @@ -3,8 +3,4 @@ PACKAGE= tests ATF_TESTS_SH= port -# "ctladm port" does not report the name of the port just created, so we can't -# cleanup unless we assume that no other test created a port too. 
-TEST_METADATA+= is_exclusive="true" - .include diff --git a/usr.sbin/ctladm/tests/port.sh b/usr.sbin/ctladm/tests/port.sh index b2cdea6f1de2..ccc4a6fc502e 100644 --- a/usr.sbin/ctladm/tests/port.sh +++ b/usr.sbin/ctladm/tests/port.sh @@ -20,8 +20,9 @@ skip_if_ctld() { cleanup() { driver=$1 - if [ -e after-ports ]; then - diff before-ports after-ports | awk "/$driver/ {print \$2}" | xargs -n1 ctladm port -r -d ioctl -p + if [ -e port-create.txt ]; then + portnum=`awk '/port:/ {print $2}' port-create.txt` + ctladm port -r -d $driver -p $portnum fi } @@ -35,12 +36,13 @@ create_ioctl_body() { skip_if_ctld - atf_check -o save:before-ports ctladm portlist -qf ioctl - atf_check ctladm port -c -d "ioctl" - atf_check -o save:after-ports ctladm portlist -qf ioctl - if test `wc -l before-ports | cut -w -f2` -ge `wc -l after-ports | cut -w -f2`; then - atf_fail "Did not create a new ioctl port" - fi + atf_check -o save:port-create.txt ctladm port -c -d "ioctl" + atf_check egrep -q "Port created successfully" port-create.txt + atf_check egrep -q "frontend: *ioctl" port-create.txt + atf_check egrep -q "port: *[0-9]+" port-create.txt + portnum=`awk '/port:/ {print $2}' port-create.txt` + atf_check -o save:portlist.txt ctladm portlist -qf ioctl + atf_check egrep -q "$portnum *YES *ioctl *ioctl" portlist.txt } create_ioctl_cleanup() { @@ -57,13 +59,13 @@ create_ioctl_options_body() { skip_if_ctld - atf_check -o save:before-ports ctladm portlist -qf ioctl - atf_check ctladm port -c -d "ioctl" -O pp=101 -O vp=102 - atf_check -o save:after-ports ctladm portlist -qf ioctl - if test `wc -l before-ports | cut -w -f2` -ge `wc -l after-ports | cut -w -f2`; then - atf_fail "Did not create a new ioctl port" - fi - if ! 
egrep -q '101[[:space:]]+102' after-ports; then + atf_check -o save:port-create.txt ctladm port -c -d "ioctl" -O pp=101 -O vp=102 + atf_check egrep -q "Port created successfully" port-create.txt + atf_check egrep -q "frontend: *ioctl" port-create.txt + atf_check egrep -q "port: *[0-9]+" port-create.txt + portnum=`awk '/port:/ {print $2}' port-create.txt` + atf_check -o save:portlist.txt ctladm portlist -qf ioctl + if ! egrep -q '101[[:space:]]+102' portlist.txt; then ctladm portlist atf_fail "Did not create the port with the specified options" fi @@ -84,13 +86,9 @@ disable_ioctl_body() { skip_if_ctld - atf_check -o save:before-ports ctladm portlist -qf ioctl - atf_check ctladm port -c -d "ioctl" - atf_check -o save:after-ports ctladm portlist -qf ioctl - if test `wc -l before-ports | cut -w -f2` -ge `wc -l after-ports | cut -w -f2`; then - atf_fail "Did not create a new ioctl port" - fi - portnum=`diff before-ports after-ports | awk '/ioctl/ {print $2}'`; + atf_check -o save:port-create.txt ctladm port -c -d "ioctl" + portnum=`awk '/port:/ {print $2}' port-create.txt` + atf_check -o save:portlist.txt ctladm portlist -qf ioctl atf_check -o ignore ctladm port -o off -p $portnum atf_check -o match:"^$portnum *NO" ctladm portlist -qf ioctl } @@ -109,13 +107,9 @@ enable_ioctl_body() { skip_if_ctld - atf_check -o save:before-ports ctladm portlist -qf ioctl - atf_check ctladm port -c -d "ioctl" - atf_check -o save:after-ports ctladm portlist -qf ioctl - if test `wc -l before-ports | cut -w -f2` -ge `wc -l after-ports | cut -w -f2`; then - atf_fail "Did not create a new ioctl port" - fi - portnum=`diff before-ports after-ports | awk '/ioctl/ {print $2}'`; + atf_check -o save:port-create.txt ctladm port -c -d "ioctl" + portnum=`awk '/port:/ {print $2}' port-create.txt` + atf_check -o save:portlist.txt ctladm portlist -qf ioctl atf_check -o ignore ctladm port -o off -p $portnum atf_check -o ignore ctladm port -o on -p $portnum atf_check -o match:"^$portnum *YES" ctladm 
portlist -qf ioctl @@ -135,14 +129,18 @@ remove_ioctl_body() { skip_if_ctld - atf_check -o save:before-ports ctladm portlist -qf ioctl - atf_check ctladm port -c -d "ioctl" - atf_check -o save:after-ports ctladm portlist -qf ioctl - if test `wc -l before-ports | cut -w -f2` -ge `wc -l after-ports | cut -w -f2`; then - atf_fail "Did not create a new ioctl port" + # Specify exact pp and vp to make the post-removal portlist check + # unambiguous + atf_check -o save:port-create.txt ctladm port -c -d "ioctl" -O pp=10001 -O vp=10002 + portnum=`awk '/port:/ {print $2}' port-create.txt` + atf_check -o save:portlist.txt ctladm portlist -qf ioctl + atf_check -o inline:"Port destroyed successfully\n" ctladm port -r -d ioctl -p $portnum + # Check that the port was removed. A new port may have been added with + # the same ID, so match against the pp and vp numbers, too. + if ctladm portlist -qf ioctl | egrep -q "^${portnum} .*10001 *10002"; then + ctladm portlist -qf ioctl + atf_fail "port was not removed" fi - portnum=`diff before-ports after-ports | awk '/ioctl/ {print $2}'`; - atf_check ctladm port -r -d ioctl -p $portnum } atf_init_test_cases() From afecc74cd7158da8e89c26c5267bc715c2417fe7 Mon Sep 17 00:00:00 2001 From: Alan Somers Date: Thu, 6 Jun 2024 11:19:19 -0600 Subject: [PATCH 22/91] ctladm: better documentation for adding and removing cfiscsi ports MFC after: 2 weeks Sponsored by: Axcient Reviewed by: mav Pull Request: https://github.com/freebsd/freebsd-src/pull/1279 --- sys/cam/ctl/ctl_frontend_iscsi.c | 27 ++++-- usr.sbin/ctladm/ctladm.8 | 32 ++++++- usr.sbin/ctladm/tests/port.sh | 146 ++++++++++++++++++++++++++++++- 3 files changed, 191 insertions(+), 14 deletions(-) diff --git a/sys/cam/ctl/ctl_frontend_iscsi.c b/sys/cam/ctl/ctl_frontend_iscsi.c index fe3b1a943206..e8a3fe7cd36a 100644 --- a/sys/cam/ctl/ctl_frontend_iscsi.c +++ b/sys/cam/ctl/ctl_frontend_iscsi.c @@ -2149,17 +2149,24 @@ cfiscsi_ioctl_port_create(struct ctl_req *req) uint16_t tag; target = 
dnvlist_get_string(req->args_nvl, "cfiscsi_target", NULL); - alias = dnvlist_get_string(req->args_nvl, "cfiscsi_target_alias", NULL); + if (target == NULL) { + req->status = CTL_LUN_ERROR; + snprintf(req->error_str, sizeof(req->error_str), + "Missing required argument: cfiscsi_target"); + return; + } + val = dnvlist_get_string(req->args_nvl, "cfiscsi_portal_group_tag", NULL); - - if (target == NULL || val == NULL) { + if (val == NULL) { req->status = CTL_LUN_ERROR; snprintf(req->error_str, sizeof(req->error_str), - "Missing required argument"); + "Missing required argument: cfiscsi_portal_group_tag"); return; } + alias = dnvlist_get_string(req->args_nvl, "cfiscsi_target_alias", NULL); + tag = strtoul(val, NULL, 0); ct = cfiscsi_target_find_or_create(&cfiscsi_softc, target, alias, tag); if (ct == NULL) { @@ -2250,13 +2257,19 @@ cfiscsi_ioctl_port_remove(struct ctl_req *req) uint16_t tag; target = dnvlist_get_string(req->args_nvl, "cfiscsi_target", NULL); + if (target == NULL) { + req->status = CTL_LUN_ERROR; + snprintf(req->error_str, sizeof(req->error_str), + "Missing required argument: cfiscsi_target"); + return; + } + val = dnvlist_get_string(req->args_nvl, "cfiscsi_portal_group_tag", NULL); - - if (target == NULL || val == NULL) { + if (val == NULL) { req->status = CTL_LUN_ERROR; snprintf(req->error_str, sizeof(req->error_str), - "Missing required argument"); + "Missing required argument: cfiscsi_portal_group_tag"); return; } diff --git a/usr.sbin/ctladm/ctladm.8 b/usr.sbin/ctladm/ctladm.8 index b3af4b45ba26..5c6e4aa04c77 100644 --- a/usr.sbin/ctladm/ctladm.8 +++ b/usr.sbin/ctladm/ctladm.8 @@ -35,7 +35,7 @@ .\" .\" $Id: //depot/users/kenm/FreeBSD-test2/usr.sbin/ctladm/ctladm.8#3 $ .\" -.Dd June 5, 2024 +.Dd June 6, 2024 .Dt CTLADM 8 .Os .Sh NAME @@ -166,7 +166,7 @@ .Op Fl o Ar on|off .Op Fl w Ar wwpn .Op Fl W Ar wwnn -.Op Fl O Ar pp|vp +.Op Fl O Ar name=value .Op Fl p Ar targ_port .Op Fl r .Op Fl t Ar fe_type @@ -625,7 +625,7 @@ The WWNN and WWPN may both be 
specified at the same time, but cannot be combined with enabling/disabling or listing ports. .Bl -tag -width 12n .It Fl c -Create new frontend port using free pp and vp=0. +Create new frontend port. .It Fl d Ar driver Specify the name of the frontend driver used by the .Pq Fl c @@ -644,7 +644,31 @@ If no port number or port type is specified, all ports are turned on or off. .It Fl O Ar pp|vp Specify generic options on the ioctl frontend port. -At present, only pp and vp port numbers can be set. +The list of recognized options is driver-dependent. +The +.Dq ioctl +driver recognizes +.Dq pp +and +.Dq vp . +The +.Dq iscsi +driver recognizes +.Dq cfiscsi_portal_group_tag , +.Dq cfiscsi_target , +and +.Dq cfiscsi_target_alias . +The +.Dq nvmf +driver recognizes +.Dq subnqn , +.Dq portid , +.Dq max_io_qsize , +.Dq enable_timeout , +.Dq ioccsz , +.Dq nn , +and +.Dq serial . .It Fl p Ar targ_port Specify the frontend port number. The port numbers can be found in the frontend port list. diff --git a/usr.sbin/ctladm/tests/port.sh b/usr.sbin/ctladm/tests/port.sh index ccc4a6fc502e..1f2c9aaed5c1 100644 --- a/usr.sbin/ctladm/tests/port.sh +++ b/usr.sbin/ctladm/tests/port.sh @@ -5,11 +5,20 @@ # * Creating umass ports # TODO -# * Creating iscsi ports # * Creating nvmf ports # * Creating ha ports # * Creating fc ports +# The PGTAG can be any 16-bit number. The only constraint is that each +# PGTAG,TARGET pair must be globally unique. +PGTAG=30257 + +load_cfiscsi() { + if ! kldstat -q -m cfiscsi; then + kldload cfiscsi || atf_skip "could not load cfscsi kernel mod" + fi +} + skip_if_ctld() { if service ctld onestatus > /dev/null; then # If ctld is running on this server, let's not interfere.
@@ -21,8 +30,18 @@ cleanup() { driver=$1 if [ -e port-create.txt ]; then - portnum=`awk '/port:/ {print $2}' port-create.txt` - ctladm port -r -d $driver -p $portnum + case "$driver" in + "ioctl") + PORTNUM=`awk '/port:/ {print $2}' port-create.txt` + ctladm port -r -d $driver -p $PORTNUM + ;; + "iscsi") + TARGET=`awk '/target:/ {print $2}' port-create.txt` + # PORTNUM is ignored, but must be set + PORTNUM=9999 + ctladm port -r -d $driver -p "$PORTNUM" -O cfiscsi_portal_group_tag=$PGTAG -O cfiscsi_target=$TARGET + ;; + esac fi } @@ -49,6 +68,75 @@ create_ioctl_cleanup() cleanup ioctl } +atf_test_case create_iscsi cleanup +create_iscsi_head() +{ + atf_set "descr" "ctladm can create a new iscsi port" + atf_set "require.user" "root" +} +create_iscsi_body() +{ + skip_if_ctld + load_cfiscsi + + TARGET=iqn.2018-10.myhost.create_iscsi + atf_check -o save:port-create.txt ctladm port -c -d "iscsi" -O cfiscsi_portal_group_tag=$PGTAG -O cfiscsi_target="$TARGET" + echo "target: $TARGET" >> port-create.txt + atf_check egrep -q "Port created successfully" port-create.txt + atf_check egrep -q "frontend: *iscsi" port-create.txt + atf_check egrep -q "port: *[0-9]+" port-create.txt + atf_check -o save:portlist.txt ctladm portlist -qf iscsi + # Unlike the ioctl driver, the iscsi driver creates ports in a disabled + # state, so the port's lunmap may be set before enabling it. 
+ atf_check egrep -q "$portnum *NO *iscsi *iscsi.*$TARGET" portlist.txt +} +create_iscsi_cleanup() +{ + cleanup iscsi +} + +atf_test_case create_iscsi_alias cleanup +create_iscsi_alias_head() +{ + atf_set "descr" "ctladm can create a new iscsi port with a target alias" + atf_set "require.user" "root" +} +create_iscsi_alias_body() +{ + skip_if_ctld + load_cfiscsi + + TARGET=iqn.2018-10.myhost.create_iscsi_alias + ALIAS="foobar" + atf_check -o save:port-create.txt ctladm port -c -d "iscsi" -O cfiscsi_portal_group_tag=$PGTAG -O cfiscsi_target="$TARGET" -O cfiscsi_target_alias="$ALIAS" + echo "target: $TARGET" >> port-create.txt + atf_check egrep -q "Port created successfully" port-create.txt + atf_check egrep -q "frontend: *iscsi" port-create.txt + atf_check egrep -q "port: *[0-9]+" port-create.txt + atf_check -o save:portlist.txt ctladm portlist -qvf iscsi + atf_check egrep -q "cfiscsi_target_alias=$ALIAS" portlist.txt +} +create_iscsi_alias_cleanup() +{ + cleanup iscsi +} + +atf_test_case create_iscsi_without_required_args +create_iscsi_without_required_args_head() +{ + atf_set "descr" "ctladm will gracefully fail to create an iSCSI target if required arguments are missing" + atf_set "require.user" "root" +} +create_iscsi_without_required_args_body() +{ + skip_if_ctld + load_cfiscsi + + TARGET=iqn.2018-10.myhost.create_iscsi + atf_check -s exit:1 -e match:"Missing required argument: cfiscsi_target" ctladm port -c -d "iscsi" -O cfiscsi_portal_group_tag=$PGTAG + atf_check -s exit:1 -e match:"Missing required argument: cfiscsi_portal_group_tag" ctladm port -c -d "iscsi" -O cfiscsi_target=$TARGET +} + atf_test_case create_ioctl_options cleanup create_ioctl_options_head() { @@ -143,11 +231,63 @@ remove_ioctl_body() fi } +atf_test_case remove_iscsi +remove_iscsi_head() +{ + atf_set "descr" "ctladm can remove an iscsi port" + atf_set "require.user" "root" +} +remove_iscsi_body() +{ + skip_if_ctld + load_cfiscsi + + TARGET=iqn.2018-10.myhost.remove_iscsi + atf_check -o 
save:port-create.txt ctladm port -c -d "iscsi" -O cfiscsi_portal_group_tag=$PGTAG -O cfiscsi_target="$TARGET" + portnum=`awk '/port:/ {print $2}' port-create.txt` + atf_check -o save:portlist.txt ctladm portlist -qf iscsi + atf_check -o inline:"Port destroyed successfully\n" ctladm port -r -d iscsi -p 9999 -O cfiscsi_portal_group_tag=$PGTAG -O cfiscsi_target="$TARGET" + # Check that the port was removed. A new port may have been added with + # the same ID, so match against the target and tag, too. + PGTAGHEX=0x7631 # PGTAG in hex + if ctladm portlist -qf iscsi | egrep -q "^${portnum} .*$PGTAG +[0-9]+ +$TARGET,t,$PGTAGHEX"; then + ctladm portlist -qf iscsi + atf_fail "port was not removed" + fi +} + +atf_test_case remove_iscsi_without_required_args cleanup +remove_iscsi_without_required_args_head() +{ + atf_set "descr" "ctladm will gracefully fail to remove an iSCSI target if required arguments are missing" + atf_set "require.user" "root" +} +remove_iscsi_without_required_args_body() +{ + skip_if_ctld + load_cfiscsi + + TARGET=iqn.2018-10.myhost.remove_iscsi_without_required_args + atf_check -o save:port-create.txt ctladm port -c -d "iscsi" -O cfiscsi_portal_group_tag=$PGTAG -O cfiscsi_target="$TARGET" + echo "target: $TARGET" >> port-create.txt + atf_check -s exit:1 -e match:"Missing required argument: cfiscsi_portal_group_tag" ctladm port -r -d iscsi -p 9999 -O cfiscsi_target="$TARGET" + atf_check -s exit:1 -e match:"Missing required argument: cfiscsi_target" ctladm port -r -d iscsi -p 9999 -O cfiscsi_portal_group_tag=$PGTAG +} +remove_iscsi_without_required_args_cleanup() +{ + cleanup iscsi +} + atf_init_test_cases() { atf_add_test_case create_ioctl + atf_add_test_case create_iscsi + atf_add_test_case create_iscsi_without_required_args + atf_add_test_case create_iscsi_alias atf_add_test_case create_ioctl_options atf_add_test_case disable_ioctl atf_add_test_case enable_ioctl atf_add_test_case remove_ioctl + atf_add_test_case remove_iscsi + atf_add_test_case 
remove_iscsi_without_required_args } From edbd489d09babebdc6c03924a912013be584c409 Mon Sep 17 00:00:00 2001 From: Alan Somers Date: Thu, 6 Jun 2024 13:14:43 -0600 Subject: [PATCH 23/91] ctladm: don't require the use of "-p" with "port -r" When removing a port, the ioctl frontend requires the "-p" argument. But other frontends, like cfiscsi, do not. So don't require that argument in the ctladm command. The frontend driver will report an error if any required argument is missing. MFC after: 2 weeks Sponsored by: Axcient Reviewed by: mav Pull Request: https://github.com/freebsd/freebsd-src/pull/1279 --- sys/cam/ctl/ctl_frontend_ioctl.c | 2 +- usr.sbin/ctladm/ctladm.8 | 3 +-- usr.sbin/ctladm/ctladm.c | 10 +++------- usr.sbin/ctladm/tests/port.sh | 28 +++++++++++++++++++++++----- 4 files changed, 28 insertions(+), 15 deletions(-) diff --git a/sys/cam/ctl/ctl_frontend_ioctl.c b/sys/cam/ctl/ctl_frontend_ioctl.c index 7fc8deac82b3..3449154afb38 100644 --- a/sys/cam/ctl/ctl_frontend_ioctl.c +++ b/sys/cam/ctl/ctl_frontend_ioctl.c @@ -267,7 +267,7 @@ cfi_ioctl_port_remove(struct ctl_req *req) if (port_id == -1) { req->status = CTL_LUN_ERROR; snprintf(req->error_str, sizeof(req->error_str), - "port_id not provided"); + "Missing required argument: port_id"); return; } diff --git a/usr.sbin/ctladm/ctladm.8 b/usr.sbin/ctladm/ctladm.8 index 5c6e4aa04c77..72f0162eed54 100644 --- a/usr.sbin/ctladm/ctladm.8 +++ b/usr.sbin/ctladm/ctladm.8 @@ -673,8 +673,7 @@ and Specify the frontend port number. The port numbers can be found in the frontend port list. .It Fl r -Remove port specified with -.Pq Fl p Ar targ_port . +Remove a port. 
.It Fl t Ar fe_type Specify the frontend type used by the .Pq Fl o , diff --git a/usr.sbin/ctladm/ctladm.c b/usr.sbin/ctladm/ctladm.c index 14951797ddf1..46b7b88547dd 100644 --- a/usr.sbin/ctladm/ctladm.c +++ b/usr.sbin/ctladm/ctladm.c @@ -580,11 +580,6 @@ cctl_port(int fd, int argc, char **argv, char *combinedopt) break; } case CCTL_PORT_MODE_REMOVE: - if (targ_port == -1) { - warnx("%s: -r requires -p", __func__); - retval = 1; - goto bailout; - } /* FALLTHROUGH */ case CCTL_PORT_MODE_CREATE: { bzero(&req, sizeof(req)); @@ -594,8 +589,9 @@ cctl_port(int fd, int argc, char **argv, char *combinedopt) if (port_mode == CCTL_PORT_MODE_REMOVE) { req.reqtype = CTL_REQ_REMOVE; - nvlist_add_stringf(option_list, "port_id", "%d", - targ_port); + if (targ_port != -1) + nvlist_add_stringf(option_list, "port_id", "%d", + targ_port); } else req.reqtype = CTL_REQ_CREATE; diff --git a/usr.sbin/ctladm/tests/port.sh b/usr.sbin/ctladm/tests/port.sh index 1f2c9aaed5c1..139e1a7d29a0 100644 --- a/usr.sbin/ctladm/tests/port.sh +++ b/usr.sbin/ctladm/tests/port.sh @@ -37,8 +37,6 @@ cleanup() { ;; "iscsi") TARGET=`awk '/target:/ {print $2}' port-create.txt` - # PORTNUM is ignored, but must be set - PORTNUM=9999 ctladm port -r -d $driver -p "$PORTNUM" -O cfiscsi_portal_group_tag=$PGTAG -O cfiscsi_target=$TARGET ;; esac @@ -68,6 +66,25 @@ create_ioctl_cleanup() cleanup ioctl } +atf_test_case remove_ioctl_without_required_args cleanup +remove_ioctl_without_required_args_head() +{ + atf_set "descr" "ctladm will gracefully fail to remove an ioctl target if required arguments are missing" + atf_set "require.user" "root" +} +remove_ioctl_without_required_args_body() +{ + skip_if_ctld + + atf_check -o save:port-create.txt ctladm port -c -d "ioctl" + atf_check egrep -q "Port created successfully" port-create.txt + atf_check -s exit:1 -e match:"Missing required argument: port_id" ctladm port -r -d "ioctl" +} +remove_ioctl_without_required_args_cleanup() +{ + cleanup ioctl +} + atf_test_case 
create_iscsi cleanup create_iscsi_head() { @@ -246,7 +263,7 @@ remove_iscsi_body() atf_check -o save:port-create.txt ctladm port -c -d "iscsi" -O cfiscsi_portal_group_tag=$PGTAG -O cfiscsi_target="$TARGET" portnum=`awk '/port:/ {print $2}' port-create.txt` atf_check -o save:portlist.txt ctladm portlist -qf iscsi - atf_check -o inline:"Port destroyed successfully\n" ctladm port -r -d iscsi -p 9999 -O cfiscsi_portal_group_tag=$PGTAG -O cfiscsi_target="$TARGET" + atf_check -o inline:"Port destroyed successfully\n" ctladm port -r -d iscsi -O cfiscsi_portal_group_tag=$PGTAG -O cfiscsi_target="$TARGET" # Check that the port was removed. A new port may have been added with # the same ID, so match against the target and tag, too. PGTAGHEX=0x7631 # PGTAG in hex @@ -270,8 +287,8 @@ remove_iscsi_without_required_args_body() TARGET=iqn.2018-10.myhost.remove_iscsi_without_required_args atf_check -o save:port-create.txt ctladm port -c -d "iscsi" -O cfiscsi_portal_group_tag=$PGTAG -O cfiscsi_target="$TARGET" echo "target: $TARGET" >> port-create.txt - atf_check -s exit:1 -e match:"Missing required argument: cfiscsi_portal_group_tag" ctladm port -r -d iscsi -p 9999 -O cfiscsi_target="$TARGET" - atf_check -s exit:1 -e match:"Missing required argument: cfiscsi_target" ctladm port -r -d iscsi -p 9999 -O cfiscsi_portal_group_tag=$PGTAG + atf_check -s exit:1 -e match:"Missing required argument: cfiscsi_portal_group_tag" ctladm port -r -d iscsi -O cfiscsi_target="$TARGET" + atf_check -s exit:1 -e match:"Missing required argument: cfiscsi_target" ctladm port -r -d iscsi -O cfiscsi_portal_group_tag=$PGTAG } remove_iscsi_without_required_args_cleanup() { @@ -288,6 +305,7 @@ atf_init_test_cases() atf_add_test_case disable_ioctl atf_add_test_case enable_ioctl atf_add_test_case remove_ioctl + atf_add_test_case remove_ioctl_without_required_args atf_add_test_case remove_iscsi atf_add_test_case remove_iscsi_without_required_args } From 81ef0a89fcc055bdf3d0f1affbadb643412ebce9 Mon Sep 17 
00:00:00 2001 From: Alan Somers Date: Mon, 10 Jun 2024 11:29:48 -0600 Subject: [PATCH 24/91] Fix mtree entry for ctladm tests MFC after: 2 weeks MFC with: 9747d11d91642cb9b81602d88e8aebeb388543c7 Sponsored by: Axcient --- etc/mtree/BSD.tests.dist | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/etc/mtree/BSD.tests.dist b/etc/mtree/BSD.tests.dist index 86db4304b932..92501766ef88 100644 --- a/etc/mtree/BSD.tests.dist +++ b/etc/mtree/BSD.tests.dist @@ -90,8 +90,6 @@ .. .. usr.sbin - ctladm - .. dtrace common aggs @@ -1225,6 +1223,8 @@ usr.sbin chown .. + ctladm + .. daemon .. etcupdate From 7affbeeab1c99685012df0d72df2d7a87e09e472 Mon Sep 17 00:00:00 2001 From: Ryan Libby Date: Mon, 10 Jun 2024 10:32:22 -0700 Subject: [PATCH 25/91] virstor: basic functional test Reviewed by: asomers, markj Sponsored by: Dell EMC Isilon Differential Revision: https://reviews.freebsd.org/D45535 --- tests/sys/geom/class/Makefile | 1 + tests/sys/geom/class/virstor/Makefile | 9 +++ tests/sys/geom/class/virstor/conf.sh | 31 +++++++++ tests/sys/geom/class/virstor/virstor_test.sh | 73 ++++++++++++++++++++ 4 files changed, 114 insertions(+) create mode 100644 tests/sys/geom/class/virstor/Makefile create mode 100644 tests/sys/geom/class/virstor/conf.sh create mode 100644 tests/sys/geom/class/virstor/virstor_test.sh diff --git a/tests/sys/geom/class/Makefile b/tests/sys/geom/class/Makefile index 10b01a043ddf..b640b0b46859 100644 --- a/tests/sys/geom/class/Makefile +++ b/tests/sys/geom/class/Makefile @@ -19,6 +19,7 @@ TESTS_SUBDIRS+= shsec TESTS_SUBDIRS+= stripe TESTS_SUBDIRS+= union TESTS_SUBDIRS+= uzip +TESTS_SUBDIRS+= virstor ${PACKAGE}FILES+= geom_subr.sh diff --git a/tests/sys/geom/class/virstor/Makefile b/tests/sys/geom/class/virstor/Makefile new file mode 100644 index 000000000000..67242879e33f --- /dev/null +++ b/tests/sys/geom/class/virstor/Makefile @@ -0,0 +1,9 @@ +PACKAGE= tests + +TESTSDIR= ${TESTSBASE}/sys/geom/class/${.CURDIR:T} + +ATF_TESTS_SH+= virstor_test + 
+${PACKAGE}FILES+= conf.sh + +.include diff --git a/tests/sys/geom/class/virstor/conf.sh b/tests/sys/geom/class/virstor/conf.sh new file mode 100644 index 000000000000..46b0fd1308a3 --- /dev/null +++ b/tests/sys/geom/class/virstor/conf.sh @@ -0,0 +1,31 @@ +#!/bin/sh + +class="virstor" +base=$(atf_get ident) +TEST_VIRSTOR_DEVS_FILE="${TMPDIR}/test_virstor_devs.$(basename $0)" + +gvirstor_dev_setup() +{ + # Pick a random name and record it for cleanup. + local vdevbase="$(mktemp -u virstor.XXXXXX)" || aft_fail "mktemp" + echo "$vdevbase" >> "$TEST_VIRSTOR_DEVS_FILE" + eval "${1}='${vdevbase}'" +} + +gvirstor_test_cleanup() +{ + local vdevbase + if [ -f "$TEST_VIRSTOR_DEVS_FILE" ]; then + while read vdevbase; do + if [ -c "/dev/$class/$vdevbase" ]; then + echo "# Destroying test virstor device:" \ + "$vdevbase" + gvirstor destroy "$vdevbase" + fi + done < "$TEST_VIRSTOR_DEVS_FILE" + fi + geom_test_cleanup +} + +ATF_TEST=true +. `dirname $0`/../geom_subr.sh diff --git a/tests/sys/geom/class/virstor/virstor_test.sh b/tests/sys/geom/class/virstor/virstor_test.sh new file mode 100644 index 000000000000..4f2047bffe97 --- /dev/null +++ b/tests/sys/geom/class/virstor/virstor_test.sh @@ -0,0 +1,73 @@ +# +# Copyright (c) 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# SPDX-License-Identifier: BSD-2-Clause +# + +. 
$(atf_get_srcdir)/conf.sh + +atf_test_case basic cleanup +basic_head() +{ + atf_set "descr" "geom virstor basic functional test" + atf_set "require.user" "root" +} +basic_body() +{ + geom_atf_test_setup + # Choose a virstor device name + gvirstor_dev_setup name + + # Create an md backing device and initialize it with junk + psecsize=512 + attach_md md -t swap -S $psecsize -s 5M || atf_fail "attach_md" + jot -b uninitialized 0 | dd status=none of=/dev/$md 2> /dev/null + + # Create a virstor device + vsizemb=64 + vsize=$((vsizemb * 1024 * 1024)) + atf_check -o ignore -e ignore \ + gvirstor label -v -s ${vsizemb}M -m 512 $name /dev/$md + devwait + vdev="/dev/$class/$name" + + ssize=$(diskinfo $vdev | awk '{print $2}') + atf_check_equal $psecsize $ssize + + size=$(diskinfo $vdev | awk '{print $3}') + atf_check_equal $vsize $size + + # Write the first and last sectors of the virtual address space + hasha=$(jot -b a 0 | head -c $ssize | sha1) + hashz=$(jot -b z 0 | head -c $ssize | sha1) + zsector=$((vsize / ssize - 1)) + jot -b a 0 | dd status=none of=$vdev bs=$ssize count=1 conv=notrunc + jot -b z 0 | dd status=none of=$vdev bs=$ssize count=1 conv=notrunc \ + seek=$zsector + + # Read back and compare + hashx=$(dd status=none if=$vdev bs=$ssize count=1 | sha1) + atf_check_equal $hasha $hashx + hashx=$(dd status=none if=$vdev bs=$ssize count=1 skip=$zsector | sha1) + atf_check_equal $hashz $hashx + + # Destroy, then retaste and reload + atf_check -o ignore gvirstor destroy $name + true > /dev/$md + devwait + + # Read back and compare + hashx=$(dd status=none if=$vdev bs=$ssize count=1 | sha1) + atf_check_equal $hasha $hashx + hashx=$(dd status=none if=$vdev bs=$ssize count=1 skip=$zsector | sha1) + atf_check_equal $hashz $hashx +} +basic_cleanup() +{ + gvirstor_test_cleanup +} + +atf_init_test_cases() +{ + atf_add_test_case basic +} From 0f409d2673f2b2f0958d66744f0a1dc877ce4ff0 Mon Sep 17 00:00:00 2001 From: Ryan Libby Date: Mon, 10 Jun 2024 11:56:18 -0700 Subject: 
[PATCH 26/91] BSD.tests.dist: add entry for virstor test Reviewed by: markj Fixes: 7affbeeab1c9 virstor: basic functional test Sponsored by: Dell EMC Isilon Differential Revision: https://reviews.freebsd.org/D45551 --- etc/mtree/BSD.tests.dist | 2 ++ 1 file changed, 2 insertions(+) diff --git a/etc/mtree/BSD.tests.dist b/etc/mtree/BSD.tests.dist index 92501766ef88..375e4900b5d5 100644 --- a/etc/mtree/BSD.tests.dist +++ b/etc/mtree/BSD.tests.dist @@ -835,6 +835,8 @@ etalon .. .. + virstor + .. .. .. kern From a2fda816eb054d5873be223ef2461741dfcc253c Mon Sep 17 00:00:00 2001 From: Ryan Libby Date: Mon, 10 Jun 2024 17:36:20 -0700 Subject: [PATCH 27/91] virstor: write large maps in chunks during label During the initial label of a virstor device, write out the allocation map in chunks if it is large (> 1 MB) in order to avoid large mallocs. Even though the kernel virstor geom may still do a large malloc to represent the allocation map, this may still be useful to avoid a ulimit. Reviewed by: markj Sponsored by: Dell EMC Isilon Differential Revision: https://reviews.freebsd.org/D45517 --- lib/geom/virstor/geom_virstor.c | 59 ++++++++++++++++++++++++--------- 1 file changed, 43 insertions(+), 16 deletions(-) diff --git a/lib/geom/virstor/geom_virstor.c b/lib/geom/virstor/geom_virstor.c index 131ece0107c7..6a7dfb27fe43 100644 --- a/lib/geom/virstor/geom_virstor.c +++ b/lib/geom/virstor/geom_virstor.c @@ -157,8 +157,7 @@ virstor_label(struct gctl_req *req) char param[32]; int hardcode, nargs, error; struct virstor_map_entry *map; - size_t total_chunks; /* We'll run out of memory if - this needs to be bigger. */ + size_t total_chunks, write_max_map_entries; unsigned int map_chunks; /* Chunks needed by the map (map size). */ size_t map_size; /* In bytes. 
*/ ssize_t written; @@ -325,28 +324,56 @@ virstor_label(struct gctl_req *req) sprintf(param, "%s%s", _PATH_DEV, name); fd = open(param, O_RDWR); } - if (fd < 0) + if (fd < 0) { gctl_error(req, "Cannot open provider %s to write map", name); + return; + } - /* Do it with calloc because there might be a need to set up chunk flags - * in the future */ - map = calloc(total_chunks, sizeof(*map)); + /* + * Initialize and write the map. Don't malloc the whole map at once, + * in case it's large. Use calloc because there might be a need to set + * up chunk flags in the future. + */ + write_max_map_entries = 1024 * 1024 / sizeof(*map); + if (write_max_map_entries > total_chunks) + write_max_map_entries = total_chunks; + map = calloc(write_max_map_entries, sizeof(*map)); if (map == NULL) { gctl_error(req, "Out of memory (need %zu bytes for allocation map)", - map_size); + write_max_map_entries * sizeof(*map)); + close(fd); + return; } - - written = pwrite(fd, map, map_size, 0); - free(map); - if ((size_t)written != map_size) { - if (verbose) { - fprintf(stderr, "\nTried to write %zu, written %zd (%s)\n", - map_size, written, strerror(errno)); + for (size_t chunk = 0; chunk < total_chunks; + chunk += write_max_map_entries) { + size_t bytes_to_write, entries_to_write; + + entries_to_write = total_chunks - chunk; + if (entries_to_write > write_max_map_entries) + entries_to_write = write_max_map_entries; + bytes_to_write = entries_to_write * sizeof(*map); + for (size_t off = 0; off < bytes_to_write; off += written) { + written = write(fd, ((char *)map) + off, + bytes_to_write - off); + if (written < 0) { + if (verbose) { + fprintf(stderr, + "\nError writing map at offset " + "%zu of %zu: %s\n", + chunk * sizeof(*map) + off, + map_size, strerror(errno)); + } + gctl_error(req, + "Error writing out allocation map!"); + free(map); + close(fd); + return; + } } - gctl_error(req, "Error writing out allocation map!"); - return; } + free(map); + map = NULL; close (fd); if (verbose) From 
e754909cb0aeaf759cddf79c14a04a42f8d894bc Mon Sep 17 00:00:00 2001 From: Ryan Libby Date: Mon, 10 Jun 2024 17:38:17 -0700 Subject: [PATCH 28/91] virstor: remove relation between chunk size and MAXPHYS There's no reason why the virstor chunk size needs to relate to MAXPHYS. Remove it. Instead, just make sure that the chunk size is a multiple of the sector size. Reviewed by: imp Sponsored by: Dell EMC Isilon Differential Revision: https://reviews.freebsd.org/D45518 --- lib/geom/virstor/geom_virstor.c | 40 ++++++++++++--------------------- 1 file changed, 14 insertions(+), 26 deletions(-) diff --git a/lib/geom/virstor/geom_virstor.c b/lib/geom/virstor/geom_virstor.c index 6a7dfb27fe43..5f5087e99213 100644 --- a/lib/geom/virstor/geom_virstor.c +++ b/lib/geom/virstor/geom_virstor.c @@ -195,27 +195,6 @@ virstor_label(struct gctl_req *req) return; } - if (md.md_chunk_size % MAXPHYS != 0) { - /* XXX: This is not strictly needed, but it's convenient to - * impose some limitations on it, so why not MAXPHYS. 
*/ - size_t new_size = rounddown(md.md_chunk_size, MAXPHYS); - if (new_size < md.md_chunk_size) - new_size += MAXPHYS; - fprintf(stderr, "Resizing chunk size to be a multiple of " - "MAXPHYS (%d kB).\n", MAXPHYS / 1024); - fprintf(stderr, "New chunk size: %zu kB\n", new_size / 1024); - md.md_chunk_size = new_size; - } - - if (md.md_virsize % md.md_chunk_size != 0) { - off_t chunk_count = md.md_virsize / md.md_chunk_size; - md.md_virsize = chunk_count * md.md_chunk_size; - fprintf(stderr, "Resizing virtual size to be a multiple of " - "chunk size.\n"); - fprintf(stderr, "New virtual size: %zu MB\n", - (size_t)(md.md_virsize/(1024 * 1024))); - } - msize = secsize = 0; for (i = 1; i < (unsigned)nargs; i++) { snprintf(param, sizeof(param), "arg%u", i); @@ -240,11 +219,20 @@ virstor_label(struct gctl_req *req) } if (md.md_chunk_size % secsize != 0) { - fprintf(stderr, "Error: chunk size is not a multiple of sector " - "size."); - gctl_error(req, "Chunk size (in bytes) must be multiple of %u.", - (unsigned int)secsize); - return; + size_t new_size = roundup(md.md_chunk_size, secsize); + fprintf(stderr, "Resizing chunk size to be a multiple of " + "sector size (%zu bytes).\n", secsize); + fprintf(stderr, "New chunk size: %zu kB\n", new_size / 1024); + md.md_chunk_size = new_size; + } + + if (md.md_virsize % md.md_chunk_size != 0) { + off_t chunk_count = md.md_virsize / md.md_chunk_size; + md.md_virsize = chunk_count * md.md_chunk_size; + fprintf(stderr, "Resizing virtual size to be a multiple of " + "chunk size.\n"); + fprintf(stderr, "New virtual size: %zu MB\n", + (size_t)(md.md_virsize / (1024 * 1024))); } total_chunks = md.md_virsize / md.md_chunk_size; From f65d0b18d9372b522e247c7bd58422a7ab3d30d8 Mon Sep 17 00:00:00 2001 From: Peter Holm Date: Tue, 11 Jun 2024 09:38:13 +0200 Subject: [PATCH 29/91] stress2: Replace random() with arc4random() --- tools/test/stress2/misc/nfsrename.sh | 4 ++-- tools/test/stress2/misc/umountf2.sh | 3 ++- 2 files changed, 4 insertions(+),
3 deletions(-) diff --git a/tools/test/stress2/misc/nfsrename.sh b/tools/test/stress2/misc/nfsrename.sh index b6513fba1553..864c64aed8ac 100755 --- a/tools/test/stress2/misc/nfsrename.sh +++ b/tools/test/stress2/misc/nfsrename.sh @@ -137,7 +137,7 @@ write_file(void) unlink(path); } - fprintf(fp, "blah blah blah garbage %ld\n", random()); + fprintf(fp, "blah blah blah garbage %ld\n", (long)arc4random()); fclose(fp); if (rename(path, filename) < 0) { warn("rename"); @@ -150,7 +150,7 @@ random_sleep(int base, int slop) { long val; - val = random() % slop; + val = arc4random() % slop; usleep(base + val); } diff --git a/tools/test/stress2/misc/umountf2.sh b/tools/test/stress2/misc/umountf2.sh index 5018a804c403..cd5b53616f97 100755 --- a/tools/test/stress2/misc/umountf2.sh +++ b/tools/test/stress2/misc/umountf2.sh @@ -886,11 +886,12 @@ test(void) { unsigned long offset; unsigned long size = maxoplen; - unsigned long rv = random(); + unsigned long rv; unsigned long op = rv % (3 + !lite + mapped_writes); /* turn off the map read if necessary */ + arc4random_buf(&rv, sizeof(rv)); if (op == 2 && !mapped_reads) op = 0; From e02d20ddff7f9f9509b28095459327bc183dab8a Mon Sep 17 00:00:00 2001 From: Wei Hu Date: Tue, 11 Jun 2024 10:05:21 +0000 Subject: [PATCH 30/91] Hyper_V: add a boot parameter to tlb flush hypercall Add boot parameter hw.vmbus.tlb_hcall for tlb flush hypercall. By default it is set to 1 to allow hypercall tlb flush. It can be set to 0 in loader.conf to turn off hypercall and use system provided tlb flush routine. The change also changes the flag in the per cpu contiguous memory allocation to no wait, to avoid a panic that happened in some cases where there is not enough contiguous memory available at boot time.
Reported by: gbe Tested by: whu MFC after: 1 week Fixes: 2b887687edc25bb4553f0d8a1183f454a85d413d Sponsored by: Microsoft --- sys/dev/hyperv/vmbus/vmbus.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/sys/dev/hyperv/vmbus/vmbus.c b/sys/dev/hyperv/vmbus/vmbus.c index c1fa9107d3c2..f55f0329b017 100644 --- a/sys/dev/hyperv/vmbus/vmbus.c +++ b/sys/dev/hyperv/vmbus/vmbus.c @@ -147,6 +147,13 @@ SYSCTL_NODE(_hw, OID_AUTO, vmbus, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, static int vmbus_pin_evttask = 1; SYSCTL_INT(_hw_vmbus, OID_AUTO, pin_evttask, CTLFLAG_RDTUN, &vmbus_pin_evttask, 0, "Pin event tasks to their respective CPU"); + +#if defined(__x86_64__) +static int hv_tlb_hcall = 1; +SYSCTL_INT(_hw_vmbus, OID_AUTO, tlb_hcall , CTLFLAG_RDTUN, + &hv_tlb_hcall, 0, "Use Hyper_V hyercall for tlb flush"); +#endif + uint32_t vmbus_current_version; static const uint32_t vmbus_version[] = { @@ -756,8 +763,19 @@ vmbus_synic_setup(void *xsc) if (VMBUS_PCPU_GET(sc, vcpuid, cpu) > hv_max_vp_index) hv_max_vp_index = VMBUS_PCPU_GET(sc, vcpuid, cpu); hv_cpu_mem = DPCPU_ID_PTR(cpu, hv_pcpu_mem); - *hv_cpu_mem = contigmalloc(PAGE_SIZE, M_DEVBUF, M_WAITOK | M_ZERO, + *hv_cpu_mem = contigmalloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT | M_ZERO, 0ul, ~0ul, PAGE_SIZE, 0); + +#if defined(__x86_64__) + if (*hv_cpu_mem == NULL && hv_tlb_hcall) { + hv_tlb_hcall = 0; + if (bootverbose && sc) + device_printf(sc->vmbus_dev, + "cannot alloc contig memory for hv_pcpu_mem, " + "use system provided tlb flush call.\n"); + } +#endif + /* * Setup the SynIC message. 
*/ @@ -1502,7 +1520,8 @@ vmbus_doattach(struct vmbus_softc *sc) sc->vmbus_flags |= VMBUS_FLAG_SYNIC; #if defined(__x86_64__) - smp_targeted_tlb_shootdown = &hyperv_vm_tlb_flush; + if (hv_tlb_hcall) + smp_targeted_tlb_shootdown = &hyperv_vm_tlb_flush; #endif /* From abf239cf097b7a16defb9857027165427341c38c Mon Sep 17 00:00:00 2001 From: Andrew Turner Date: Tue, 11 Jun 2024 09:49:14 +0100 Subject: [PATCH 31/91] arm64/vmm: Add braces to fix the gcc build Reviewed by: markj, emaste Sponsored by: Arm Ltd Differential Revision: https://reviews.freebsd.org/D45548 --- sys/arm64/vmm/vmm_arm64.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sys/arm64/vmm/vmm_arm64.c b/sys/arm64/vmm/vmm_arm64.c index e0547bcef914..6b058a993cdd 100644 --- a/sys/arm64/vmm/vmm_arm64.c +++ b/sys/arm64/vmm/vmm_arm64.c @@ -1352,7 +1352,7 @@ vmmops_setcap(void *vcpui, int num, int val) switch (num) { case VM_CAP_BRK_EXIT: - if ((val != 0) == (hypctx->setcaps & (1ul << num)) != 0) + if ((val != 0) == ((hypctx->setcaps & (1ul << num)) != 0)) break; if (val != 0) hypctx->mdcr_el2 |= MDCR_EL2_TDE; @@ -1360,7 +1360,7 @@ vmmops_setcap(void *vcpui, int num, int val) hypctx->mdcr_el2 &= ~MDCR_EL2_TDE; break; case VM_CAP_SS_EXIT: - if ((val != 0) == (hypctx->setcaps & (1ul << num)) != 0) + if ((val != 0) == ((hypctx->setcaps & (1ul << num)) != 0)) break; if (val != 0) { @@ -1382,7 +1382,7 @@ vmmops_setcap(void *vcpui, int num, int val) } break; case VM_CAP_MASK_HWINTR: - if ((val != 0) == (hypctx->setcaps & (1ul << num)) != 0) + if ((val != 0) == ((hypctx->setcaps & (1ul << num)) != 0)) break; if (val != 0) { From 9a4813e1dc1781d952d8db017fc7cc005095c6a0 Mon Sep 17 00:00:00 2001 From: Andrew Turner Date: Tue, 11 Jun 2024 09:49:32 +0100 Subject: [PATCH 32/91] bhyve: Fix the build with gcc gcc doesn't like const and static to not be at the start of a variable declaration. Update the gdb_regset arrays to make it more obvious they are arrays of struct gdb_reg and to fix the gcc build. 
Reviewed by: corvink, markj Sponsored by: Arm Ltd Differential Revision: https://reviews.freebsd.org/D45550 --- usr.sbin/bhyve/gdb.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/usr.sbin/bhyve/gdb.c b/usr.sbin/bhyve/gdb.c index 6df8026a0245..a13166e1e2c8 100644 --- a/usr.sbin/bhyve/gdb.c +++ b/usr.sbin/bhyve/gdb.c @@ -166,10 +166,10 @@ static bool gdb_active = false; struct gdb_reg { enum vm_reg_name id; int size; -} +}; #ifdef __amd64__ -static const gdb_regset[] = { +static const struct gdb_reg gdb_regset[] = { { .id = VM_REG_GUEST_RAX, .size = 8 }, { .id = VM_REG_GUEST_RBX, .size = 8 }, { .id = VM_REG_GUEST_RCX, .size = 8 }, @@ -212,7 +212,7 @@ static const gdb_regset[] = { { .id = VM_REG_GUEST_EFER, .size = 8 }, }; #else /* __aarch64__ */ -static const gdb_regset[] = { +static const struct gdb_reg gdb_regset[] = { { .id = VM_REG_GUEST_X0, .size = 8 }, { .id = VM_REG_GUEST_X1, .size = 8 }, { .id = VM_REG_GUEST_X2, .size = 8 }, From cb53f83d8ad8ee6fd9778c38b30e71244254e5bb Mon Sep 17 00:00:00 2001 From: Peter Holm Date: Tue, 11 Jun 2024 15:51:21 +0200 Subject: [PATCH 33/91] Revert "stress2: Replace rename() with arc4random()" This reverts commit f65d0b18d9372b522e247c7bd58422a7ab3d30d8. 
Misleading commit message --- tools/test/stress2/misc/nfsrename.sh | 4 ++-- tools/test/stress2/misc/umountf2.sh | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/tools/test/stress2/misc/nfsrename.sh b/tools/test/stress2/misc/nfsrename.sh index 864c64aed8ac..b6513fba1553 100755 --- a/tools/test/stress2/misc/nfsrename.sh +++ b/tools/test/stress2/misc/nfsrename.sh @@ -137,7 +137,7 @@ write_file(void) unlink(path); } - fprintf(fp, "blah blah blah garbage %ld\n", (long)arc4random()); + fprintf(fp, "blah blah blah garbage %ld\n", random()); fclose(fp); if (rename(path, filename) < 0) { warn("rename"); @@ -150,7 +150,7 @@ random_sleep(int base, int slop) { long val; - val = arc4random() % slop; + val = random() % slop; usleep(base + val); } diff --git a/tools/test/stress2/misc/umountf2.sh b/tools/test/stress2/misc/umountf2.sh index cd5b53616f97..5018a804c403 100755 --- a/tools/test/stress2/misc/umountf2.sh +++ b/tools/test/stress2/misc/umountf2.sh @@ -886,12 +886,11 @@ test(void) { unsigned long offset; unsigned long size = maxoplen; - unsigned long rv; + unsigned long rv = random(); unsigned long op = rv % (3 + !lite + mapped_writes); /* turn off the map read if necessary */ - arc4random_buf(&rv, sizeof(rv)); if (op == 2 && !mapped_reads) op = 0; From ff4a72c159e06c15a4573b9a4b40f6dc55544538 Mon Sep 17 00:00:00 2001 From: Peter Holm Date: Tue, 11 Jun 2024 15:52:34 +0200 Subject: [PATCH 34/91] stress2: Replace random() with arc4random() --- tools/test/stress2/misc/nfsrename.sh | 4 ++-- tools/test/stress2/misc/umountf2.sh | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/tools/test/stress2/misc/nfsrename.sh b/tools/test/stress2/misc/nfsrename.sh index b6513fba1553..864c64aed8ac 100755 --- a/tools/test/stress2/misc/nfsrename.sh +++ b/tools/test/stress2/misc/nfsrename.sh @@ -137,7 +137,7 @@ write_file(void) unlink(path); } - fprintf(fp, "blah blah blah garbage %ld\n", random()); + fprintf(fp, "blah blah blah garbage %ld\n", 
(long)arc4random()); fclose(fp); if (rename(path, filename) < 0) { warn("rename"); @@ -150,7 +150,7 @@ random_sleep(int base, int slop) { long val; - val = random() % slop; + val = arc4random() % slop; usleep(base + val); } diff --git a/tools/test/stress2/misc/umountf2.sh b/tools/test/stress2/misc/umountf2.sh index 5018a804c403..cd5b53616f97 100755 --- a/tools/test/stress2/misc/umountf2.sh +++ b/tools/test/stress2/misc/umountf2.sh @@ -886,11 +886,12 @@ test(void) { unsigned long offset; unsigned long size = maxoplen; - unsigned long rv = random(); + unsigned long rv; unsigned long op = rv % (3 + !lite + mapped_writes); /* turn off the map read if necessary */ + arc4random_buf(&rv, sizeof(rv)); if (op == 2 && !mapped_reads) op = 0; From 0277c0c6f72a964bf0439db1f4558551052e4f3b Mon Sep 17 00:00:00 2001 From: Ed Maste Date: Mon, 10 Jun 2024 20:31:08 -0400 Subject: [PATCH 35/91] ktrace(1): add more xrefs Following commit a87651e2ff18 add xrefs to intro(2) and sigaction(2), and use a consistent form. Suggested by: kib, arrowd Reviewed by: kib (earlier) Sponsored by: The FreeBSD Foundation Differential Revision: https://reviews.freebsd.org/D45555 --- usr.bin/ktrace/ktrace.1 | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/usr.bin/ktrace/ktrace.1 b/usr.bin/ktrace/ktrace.1 index 162706c65e6e..5cc6df52d69a 100644 --- a/usr.bin/ktrace/ktrace.1 +++ b/usr.bin/ktrace/ktrace.1 @@ -48,9 +48,13 @@ The utility enables kernel trace logging for the specified processes. Kernel trace data is logged to the file .Pa ktrace.out . -The kernel operations that are traced include system calls, -.Xr namei 9 -translations, signal processing, and +The kernel operations that are traced include system calls +.Pq see Xr intro 2 , +file system path lookups +.Pq Xr namei 9 , +signal processing +.Pq Xr sigaction 2 , +and .Tn I/O . 
.Pp Once tracing is enabled on a process, trace data will be logged until @@ -120,7 +124,9 @@ trace page faults trace .Tn I/O .It Cm n -trace namei translations +trace +.Xr namei 9 +translations .It Cm p trace capability check failures .It Cm s @@ -198,7 +204,9 @@ Disable tracing of all user-owned processes: .Xr dtrace 1 , .Xr kdump 1 , .Xr truss 1 , +.Xr intro 2 , .Xr ktrace 2 , +.Xr sigaction 2 , .Xr utrace 2 , .Xr capsicum 4 , .Xr namei 9 From dd0e5c02ab13b9eb240d42a71a8f41a8b036bd33 Mon Sep 17 00:00:00 2001 From: Doug Moore Date: Tue, 11 Jun 2024 11:36:23 -0500 Subject: [PATCH 36/91] swap_pager: small improvement to find_least Drop an unneeded test, a branch and a needless computation to save a few instructions. Reviewed by: kib Differential Revision: https://reviews.freebsd.org/D45558 --- sys/vm/swap_pager.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c index a07a38481490..15f227457bba 100644 --- a/sys/vm/swap_pager.c +++ b/sys/vm/swap_pager.c @@ -2289,22 +2289,17 @@ swap_pager_find_least(vm_object_t object, vm_pindex_t pindex) VM_OBJECT_ASSERT_LOCKED(object); MPASS((object->flags & OBJ_SWAP) != 0); - if (pctrie_is_empty(&object->un_pager.swp.swp_blks)) - return (object->size); sb = SWAP_PCTRIE_LOOKUP_GE(&object->un_pager.swp.swp_blks, rounddown(pindex, SWAP_META_PAGES)); if (sb == NULL) return (object->size); - if (sb->p < pindex) { - for (i = pindex % SWAP_META_PAGES; i < SWAP_META_PAGES; i++) { - if (sb->d[i] != SWAPBLK_NONE) - return (sb->p + i); - } - sb = SWAP_PCTRIE_LOOKUP_GE(&object->un_pager.swp.swp_blks, - roundup(pindex, SWAP_META_PAGES)); - if (sb == NULL) - return (object->size); + for (i = pindex - sb->p; i < SWAP_META_PAGES; i++) { + if (sb->d[i] != SWAPBLK_NONE) + return (sb->p + i); } + sb = SWAP_PCTRIE_LOOKUP_GE(&object->un_pager.swp.swp_blks, pindex); + if (sb == NULL) + return (object->size); for (i = 0; i < SWAP_META_PAGES; i++) { if (sb->d[i] != 
SWAPBLK_NONE) return (sb->p + i); @@ -2314,7 +2309,7 @@ swap_pager_find_least(vm_object_t object, vm_pindex_t pindex) * We get here if a swblk is present in the trie but it * doesn't map any blocks. */ - MPASS(0); + __unreachable(); return (object->size); } From 703768a23590d8faf65b0f16dd395248ff7273f6 Mon Sep 17 00:00:00 2001 From: Jessica Clarke Date: Tue, 11 Jun 2024 17:43:18 +0100 Subject: [PATCH 37/91] mx25l.4: Document the correct disk device path This was true at time of commit, but the path was changed 2 weeks later to just be the /dev/flash/spiN name, without updating the manpage. Reported by: David Gilbert Fixes: 68dd77957786 ("Give the mx25l device sole ownership of the name /dev/flash/spi* ...") MFC after: 1 week --- share/man/man4/mx25l.4 | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/share/man/man4/mx25l.4 b/share/man/man4/mx25l.4 index ff2e78a8c5aa..5dffdebcf43e 100644 --- a/share/man/man4/mx25l.4 +++ b/share/man/man4/mx25l.4 @@ -22,7 +22,7 @@ .\" (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF .\" THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .\" -.Dd February 26, 2019 +.Dd June 11, 2024 .Dt MX25L 4 .Os .Sh NAME @@ -60,7 +60,7 @@ such as block size, sector size, and device capacity. When a supported device is found, the .Nm driver creates a disk device and makes it accessible at -.Pa /dev/flash/mx25l? . +.Pa /dev/flash/spi? . The new disk device is then tasted by the available .Xr geom 4 modules as with any disk device. @@ -189,13 +189,9 @@ Set the high bit (1 << 31) to invert the logic level of the chip select line. The SPI mode (0-3) to use when communicating with this device. .El .Sh FILES -.Bl -tag -width /dev/flash/mx25l? -.It Pa /dev/flash/mx25l? -Provides read/write access to the storage device. +.Bl -tag -width /dev/flash/spi? .It Pa /dev/flash/spi? -An alias for the -.Pa /dev/mx25l? -device, for backwards compatibility with older versions of the driver. 
+Provides read/write access to the storage device. .El .Sh SEE ALSO .Xr fdt 4 , From 92927b8bcf51dcbcf99d633c1b3cab3cab2373ac Mon Sep 17 00:00:00 2001 From: Ed Maste Date: Tue, 11 Jun 2024 20:23:44 -0400 Subject: [PATCH 38/91] msun: update Clang bug reference in fma test LLVM bugzilla bug 8100 became issue #8472 with the migration to GitHub. https://github.com/llvm/llvm-project/issues/8472 --- lib/msun/src/s_fma.c | 4 ++-- lib/msun/src/s_fmal.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/msun/src/s_fma.c b/lib/msun/src/s_fma.c index 4d08b40cc71a..686e80e66f5d 100644 --- a/lib/msun/src/s_fma.c +++ b/lib/msun/src/s_fma.c @@ -244,7 +244,7 @@ fma(double x, double y, double z) zs = copysign(DBL_MIN, zs); fesetround(FE_TONEAREST); - /* work around clang bug 8100 */ + /* work around clang issue #8472 */ volatile double vxs = xs; /* @@ -278,7 +278,7 @@ fma(double x, double y, double z) * rounding modes. */ fesetround(oround); - /* work around clang bug 8100 */ + /* work around clang issue #8472 */ volatile double vrlo = r.lo; adj = vrlo + xy.lo; return (ldexp(r.hi + adj, spread)); diff --git a/lib/msun/src/s_fmal.c b/lib/msun/src/s_fmal.c index 12f9c364670b..a53d85f5ca57 100644 --- a/lib/msun/src/s_fmal.c +++ b/lib/msun/src/s_fmal.c @@ -225,7 +225,7 @@ fmal(long double x, long double y, long double z) zs = copysignl(LDBL_MIN, zs); fesetround(FE_TONEAREST); - /* work around clang bug 8100 */ + /* work around clang issue #8472 */ volatile long double vxs = xs; /* @@ -259,7 +259,7 @@ fmal(long double x, long double y, long double z) * rounding modes. 
*/ fesetround(oround); - /* work around clang bug 8100 */ + /* work around clang issue #8472 */ volatile long double vrlo = r.lo; adj = vrlo + xy.lo; return (ldexpl(r.hi + adj, spread)); From e77ad954bb825983b4346b9cc646c9c910b1be24 Mon Sep 17 00:00:00 2001 From: Ed Maste Date: Tue, 11 Jun 2024 21:34:02 -0400 Subject: [PATCH 39/91] Revert "libm: fma: correct zero sign with small inputs" This change introduced a test failure, so revert until that can be addressed. This reverts commit 888796ade2842486d3167067e8034254c38aadd3. PR: 277783 Reported by: rlibby Sponsored by: The FreeBSD Foundation --- lib/msun/src/s_fma.c | 4 +--- lib/msun/src/s_fmal.c | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/lib/msun/src/s_fma.c b/lib/msun/src/s_fma.c index 686e80e66f5d..16902d321290 100644 --- a/lib/msun/src/s_fma.c +++ b/lib/msun/src/s_fma.c @@ -267,9 +267,7 @@ fma(double x, double y, double z) */ fesetround(oround); volatile double vzs = zs; /* XXX gcc CSE bug workaround */ - xs = ldexp(xy.lo, spread); - xy.hi += vzs; - return (xy.hi == 0 ? xs : xy.hi + xs); + return (xy.hi + vzs + ldexp(xy.lo, spread)); } if (oround != FE_TONEAREST) { diff --git a/lib/msun/src/s_fmal.c b/lib/msun/src/s_fmal.c index a53d85f5ca57..9d08bc72e12e 100644 --- a/lib/msun/src/s_fmal.c +++ b/lib/msun/src/s_fmal.c @@ -248,9 +248,7 @@ fmal(long double x, long double y, long double z) */ fesetround(oround); volatile long double vzs = zs; /* XXX gcc CSE bug workaround */ - xs = ldexpl(xy.lo, spread); - xy.hi += vzs; - return (xy.hi == 0 ? xs : xy.hi + xs); + return (xy.hi + vzs + ldexpl(xy.lo, spread)); } if (oround != FE_TONEAREST) { From a880104a21bf41ebbb4ead26e6d4adda32bad76c Mon Sep 17 00:00:00 2001 From: Doug Moore Date: Tue, 11 Jun 2024 22:51:40 -0500 Subject: [PATCH 40/91] swap_pager: add new page range struct Define a page_range struct to pair up the two values passed to freerange functions. 
Have swp_pager_freeswapspace also take a page_range argument rather than a pair of arguments. In swp_pager_meta_free_all, drop a needless test and use a new helper function to do the cleanup for each swap block. Reviewed by: kib Differential Revision: https://reviews.freebsd.org/D45562 --- sys/vm/swap_pager.c | 99 +++++++++++++++++++++++++-------------------- 1 file changed, 56 insertions(+), 43 deletions(-) diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c index 15f227457bba..3bfda3eea169 100644 --- a/sys/vm/swap_pager.c +++ b/sys/vm/swap_pager.c @@ -142,6 +142,15 @@ struct swblk { daddr_t d[SWAP_META_PAGES]; }; +/* + * A page_range structure records the start address and length of a sequence of + * mapped page addresses. + */ +struct page_range { + daddr_t start; + daddr_t num; +}; + static MALLOC_DEFINE(M_VMPGDATA, "vm_pgdata", "swap pager private data"); static struct mtx sw_dev_mtx; static TAILQ_HEAD(, swdevt) swtailq = TAILQ_HEAD_INITIALIZER(swtailq); @@ -471,7 +480,7 @@ static int swapoff_one(struct swdevt *sp, struct ucred *cred, /* * Swap bitmap functions */ -static void swp_pager_freeswapspace(daddr_t blk, daddr_t npages); +static void swp_pager_freeswapspace(const struct page_range *range); static daddr_t swp_pager_getswapspace(int *npages); /* @@ -486,23 +495,21 @@ static void swp_pager_meta_free_all(vm_object_t); static daddr_t swp_pager_meta_lookup(vm_object_t, vm_pindex_t); static void -swp_pager_init_freerange(daddr_t *start, daddr_t *num) +swp_pager_init_freerange(struct page_range *range) { - - *start = SWAPBLK_NONE; - *num = 0; + range->start = SWAPBLK_NONE; + range->num = 0; } static void -swp_pager_update_freerange(daddr_t *start, daddr_t *num, daddr_t addr) +swp_pager_update_freerange(struct page_range *range, daddr_t addr) { - - if (*start + *num == addr) { - (*num)++; + if (range->start + range->num == addr) { + range->num++; } else { - swp_pager_freeswapspace(*start, *num); - *start = addr; - *num = 1; + swp_pager_freeswapspace(range); 
+ range->start = addr; + range->num = 1; } } @@ -906,10 +913,13 @@ swp_pager_strategy(struct buf *bp) * This routine may not sleep. */ static void -swp_pager_freeswapspace(daddr_t blk, daddr_t npages) +swp_pager_freeswapspace(const struct page_range *range) { + daddr_t blk, npages; struct swdevt *sp; + blk = range->start; + npages = range->num; if (npages == 0) return; mtx_lock(&sw_dev_mtx); @@ -1004,11 +1014,12 @@ swap_pager_freespace_pgo(vm_object_t object, vm_pindex_t start, vm_size_t size) int swap_pager_reserve(vm_object_t object, vm_pindex_t start, vm_pindex_t size) { - daddr_t addr, blk, n_free, s_free; + struct page_range range; + daddr_t addr, blk; vm_pindex_t i, j; int n; - swp_pager_init_freerange(&s_free, &n_free); + swp_pager_init_freerange(&range); VM_OBJECT_WLOCK(object); for (i = 0; i < size; i += n) { n = MIN(size - i, INT_MAX); @@ -1022,11 +1033,10 @@ swap_pager_reserve(vm_object_t object, vm_pindex_t start, vm_pindex_t size) addr = swp_pager_meta_build(object, start + i + j, blk + j); if (addr != SWAPBLK_NONE) - swp_pager_update_freerange(&s_free, &n_free, - addr); + swp_pager_update_freerange(&range, addr); } } - swp_pager_freeswapspace(s_free, n_free); + swp_pager_freeswapspace(&range); VM_OBJECT_WUNLOCK(object); return (0); } @@ -1195,6 +1205,7 @@ swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, static void swap_pager_unswapped(vm_page_t m) { + struct page_range range; struct swblk *sb; vm_object_t obj; @@ -1233,9 +1244,11 @@ swap_pager_unswapped(vm_page_t m) rounddown(m->pindex, SWAP_META_PAGES)); if (sb == NULL) return; - if (sb->d[m->pindex % SWAP_META_PAGES] == SWAPBLK_NONE) + range.start = sb->d[m->pindex % SWAP_META_PAGES]; + if (range.start == SWAPBLK_NONE) return; - swp_pager_freeswapspace(sb->d[m->pindex % SWAP_META_PAGES], 1); + range.num = 1; + swp_pager_freeswapspace(&range); sb->d[m->pindex % SWAP_META_PAGES] = SWAPBLK_NONE; swp_pager_free_empty_swblk(m->object, sb); } @@ -1480,8 +1493,9 @@ static void 
swap_pager_putpages(vm_object_t object, vm_page_t *ma, int count, int flags, int *rtvals) { + struct page_range range; struct buf *bp; - daddr_t addr, blk, n_free, s_free; + daddr_t addr, blk; vm_page_t mreq; int i, j, n; bool async; @@ -1492,7 +1506,7 @@ swap_pager_putpages(vm_object_t object, vm_page_t *ma, int count, VM_OBJECT_WUNLOCK(object); async = curproc == pageproc && (flags & VM_PAGER_PUT_SYNC) == 0; - swp_pager_init_freerange(&s_free, &n_free); + swp_pager_init_freerange(&range); /* * Assign swap blocks and issue I/O. We reallocate swap on the fly. @@ -1530,8 +1544,7 @@ swap_pager_putpages(vm_object_t object, vm_page_t *ma, int count, addr = swp_pager_meta_build(mreq->object, mreq->pindex, blk + j); if (addr != SWAPBLK_NONE) - swp_pager_update_freerange(&s_free, &n_free, - addr); + swp_pager_update_freerange(&range, addr); MPASS(mreq->dirty == VM_PAGE_BITS_ALL); mreq->oflags |= VPO_SWAPINPROG; } @@ -1603,7 +1616,7 @@ swap_pager_putpages(vm_object_t object, vm_page_t *ma, int count, */ swp_pager_async_iodone(bp); } - swp_pager_freeswapspace(s_free, n_free); + swp_pager_freeswapspace(&range); VM_OBJECT_WLOCK(object); } @@ -2131,9 +2144,9 @@ static void swp_pager_meta_transfer(vm_object_t srcobject, vm_object_t dstobject, vm_pindex_t pindex, vm_pindex_t count, vm_size_t *moved) { + struct page_range range; struct swblk *sb; vm_page_t m; - daddr_t n_free, s_free; vm_pindex_t offset, last; vm_size_t mc; int i, limit, start; @@ -2146,7 +2159,7 @@ swp_pager_meta_transfer(vm_object_t srcobject, vm_object_t dstobject, if (count == 0 || pctrie_is_empty(&srcobject->un_pager.swp.swp_blks)) goto out; - swp_pager_init_freerange(&s_free, &n_free); + swp_pager_init_freerange(&range); offset = pindex; last = pindex + count; for (;;) { @@ -2163,8 +2176,7 @@ swp_pager_meta_transfer(vm_object_t srcobject, vm_object_t dstobject, if (dstobject == NULL || !swp_pager_xfer_source(srcobject, dstobject, sb->p + i - offset, sb->d[i])) { - swp_pager_update_freerange(&s_free, 
&n_free, - sb->d[i]); + swp_pager_update_freerange(&range, sb->d[i]); } if (moved != NULL) { if (m != NULL && m->pindex != pindex + i - 1) @@ -2184,7 +2196,7 @@ swp_pager_meta_transfer(vm_object_t srcobject, vm_object_t dstobject, uma_zfree(swblk_zone, sb); } } - swp_pager_freeswapspace(s_free, n_free); + swp_pager_freeswapspace(&range); out: if (moved != NULL) *moved = mc; @@ -2207,6 +2219,16 @@ swp_pager_meta_free(vm_object_t object, vm_pindex_t pindex, vm_pindex_t count, swp_pager_meta_transfer(object, NULL, pindex, count, freed); } +static void +swp_pager_meta_free_block(struct swblk *sb, struct page_range *range) +{ + for (int i = 0; i < SWAP_META_PAGES; i++) { + if (sb->d[i] != SWAPBLK_NONE) + swp_pager_update_freerange(range, sb->d[i]); + } + uma_zfree(swblk_zone, sb); +} + /* * SWP_PAGER_META_FREE_ALL() - destroy all swap metadata associated with object * @@ -2217,28 +2239,19 @@ static void swp_pager_meta_free_all(vm_object_t object) { struct swblk *sb; - daddr_t n_free, s_free; + struct page_range range; vm_pindex_t pindex; - int i; VM_OBJECT_ASSERT_WLOCKED(object); - if (pctrie_is_empty(&object->un_pager.swp.swp_blks)) - return; - - swp_pager_init_freerange(&s_free, &n_free); + swp_pager_init_freerange(&range); for (pindex = 0; (sb = SWAP_PCTRIE_LOOKUP_GE( &object->un_pager.swp.swp_blks, pindex)) != NULL;) { pindex = sb->p + SWAP_META_PAGES; - for (i = 0; i < SWAP_META_PAGES; i++) { - if (sb->d[i] == SWAPBLK_NONE) - continue; - swp_pager_update_freerange(&s_free, &n_free, sb->d[i]); - } SWAP_PCTRIE_REMOVE(&object->un_pager.swp.swp_blks, sb->p); - uma_zfree(swblk_zone, sb); + swp_pager_meta_free_block(sb, &range); } - swp_pager_freeswapspace(s_free, n_free); + swp_pager_freeswapspace(&range); } /* From f0a0420dfd36ae90f86cc9bfb1342b8862b8c9ec Mon Sep 17 00:00:00 2001 From: Doug Moore Date: Wed, 12 Jun 2024 04:26:42 -0500 Subject: [PATCH 41/91] powerof2: replace loops with fls or ilog2 In several places, a loop tests for powers of two, or iterates through 
powers of two. In those places, replace the loop with an invocation of fls or ilog2 without changing the meaning of the code. Reviewed by: alc, markj, kib, np, erj, avg (previous version) Differential Revision: https://reviews.freebsd.org/D45494 --- sys/dev/aic7xxx/aic79xx.c | 3 +-- sys/dev/cxgb/cxgb_sge.c | 7 ++----- sys/dev/cxgbe/t4_sge.c | 4 +--- sys/dev/irdma/irdma_ctrl.c | 16 +++++----------- sys/dev/mlx5/mlx5_en/mlx5_en_main.c | 4 +--- sys/netpfil/ipfw/ip_fw_table.c | 10 +--------- 6 files changed, 11 insertions(+), 33 deletions(-) diff --git a/sys/dev/aic7xxx/aic79xx.c b/sys/dev/aic7xxx/aic79xx.c index 0c4b615c5b24..ab68c3d8b088 100644 --- a/sys/dev/aic7xxx/aic79xx.c +++ b/sys/dev/aic7xxx/aic79xx.c @@ -8593,8 +8593,7 @@ ahd_loadseq(struct ahd_softc *ahd) if (sg_prefetch_align == 0) sg_prefetch_align = 8; /* Round down to the nearest power of 2. */ - while (powerof2(sg_prefetch_align) == 0) - sg_prefetch_align--; + sg_prefetch_align = 1 << ilog2(sg_prefetch_align); cacheline_mask = sg_prefetch_align - 1; diff --git a/sys/dev/cxgb/cxgb_sge.c b/sys/dev/cxgb/cxgb_sge.c index f57494065aec..0c5be9dd6614 100644 --- a/sys/dev/cxgb/cxgb_sge.c +++ b/sys/dev/cxgb/cxgb_sge.c @@ -553,9 +553,7 @@ t3_sge_prep(adapter_t *adap, struct sge_params *p) nqsets *= adap->params.nports; fl_q_size = min(nmbclusters/(3*nqsets), FL_Q_SIZE); - - while (!powerof2(fl_q_size)) - fl_q_size--; + fl_q_size = 1 << ilog2(fl_q_size); use_16k = cxgb_use_16k_clusters != -1 ? 
cxgb_use_16k_clusters : is_offload(adap); @@ -567,8 +565,7 @@ t3_sge_prep(adapter_t *adap, struct sge_params *p) jumbo_q_size = min(nmbjumbo9/(3*nqsets), JUMBO_Q_SIZE); jumbo_buf_size = MJUM9BYTES; } - while (!powerof2(jumbo_q_size)) - jumbo_q_size--; + jumbo_q_size = 1 << ilog2(jumbo_q_size); if (fl_q_size < (FL_Q_SIZE / 4) || jumbo_q_size < (JUMBO_Q_SIZE / 2)) device_printf(adap->dev, diff --git a/sys/dev/cxgbe/t4_sge.c b/sys/dev/cxgbe/t4_sge.c index e1705ae063e2..505f2d1bf677 100644 --- a/sys/dev/cxgbe/t4_sge.c +++ b/sys/dev/cxgbe/t4_sge.c @@ -4220,9 +4220,7 @@ qsize_to_fthresh(int qsize) { u_int fthresh; - while (!powerof2(qsize)) - qsize++; - fthresh = ilog2(qsize); + fthresh = qsize == 0 ? 0 : fls(qsize - 1); if (fthresh > X_CIDXFLUSHTHRESH_128) fthresh = X_CIDXFLUSHTHRESH_128; diff --git a/sys/dev/irdma/irdma_ctrl.c b/sys/dev/irdma/irdma_ctrl.c index dc42b15392c5..6078ac43815e 100644 --- a/sys/dev/irdma/irdma_ctrl.c +++ b/sys/dev/irdma/irdma_ctrl.c @@ -4909,7 +4909,7 @@ irdma_cfg_fpm_val(struct irdma_sc_dev *dev, u32 qp_count) struct irdma_virt_mem virt_mem; u32 i, mem_size; u32 qpwanted, mrwanted, pblewanted; - u32 powerof2, hte; + u32 hte; u32 sd_needed; u32 sd_diff; u32 loop_count = 0; @@ -4938,12 +4938,8 @@ irdma_cfg_fpm_val(struct irdma_sc_dev *dev, u32 qp_count) hmc_info->sd_table.sd_cnt, max_sds); qpwanted = min(qp_count, hmc_info->hmc_obj[IRDMA_HMC_IW_QP].max_cnt); - - powerof2 = 1; - while (powerof2 <= qpwanted) - powerof2 *= 2; - powerof2 /= 2; - qpwanted = powerof2; + if (qpwanted != 0) + qpwanted = 1 << ilog2(qpwanted); mrwanted = hmc_info->hmc_obj[IRDMA_HMC_IW_MR].max_cnt; pblewanted = hmc_info->hmc_obj[IRDMA_HMC_IW_PBLE].max_cnt; @@ -4986,11 +4982,9 @@ irdma_cfg_fpm_val(struct irdma_sc_dev *dev, u32 qp_count) hmc_info->hmc_obj[IRDMA_HMC_IW_MR].cnt = mrwanted; hte = round_up(qpwanted + hmc_info->hmc_obj[IRDMA_HMC_IW_FSIMC].cnt, 512); - powerof2 = 1; - while (powerof2 < hte) - powerof2 *= 2; + hte = hte == 0 ? 
1 : 1 << fls(hte - 1); hmc_info->hmc_obj[IRDMA_HMC_IW_HTE].cnt = - powerof2 * hmc_fpm_misc->ht_multiplier; + hte * hmc_fpm_misc->ht_multiplier; if (dev->hw_attrs.uk_attrs.hw_rev == IRDMA_GEN_1) cfg_fpm_value_gen_1(dev, hmc_info, qpwanted); else diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_main.c b/sys/dev/mlx5/mlx5_en/mlx5_en_main.c index 962705e6d258..4d9d1048448b 100644 --- a/sys/dev/mlx5/mlx5_en/mlx5_en_main.c +++ b/sys/dev/mlx5/mlx5_en/mlx5_en_main.c @@ -2331,9 +2331,7 @@ mlx5e_get_wqe_sz(struct mlx5e_priv *priv, u32 *wqe_sz, u32 *nsegs) * Stride size is 16 * (n + 1), as the first segment is * control. */ - for (n = howmany(r, MLX5E_MAX_RX_BYTES); !powerof2(n + 1); n++) - ; - + n = (1 << fls(howmany(r, MLX5E_MAX_RX_BYTES))) - 1; if (n > MLX5E_MAX_BUSDMA_RX_SEGS) return (-ENOMEM); diff --git a/sys/netpfil/ipfw/ip_fw_table.c b/sys/netpfil/ipfw/ip_fw_table.c index b6847d43081b..5cf948ce39a2 100644 --- a/sys/netpfil/ipfw/ip_fw_table.c +++ b/sys/netpfil/ipfw/ip_fw_table.c @@ -1516,15 +1516,7 @@ static uint32_t roundup2p(uint32_t v) { - v--; - v |= v >> 1; - v |= v >> 2; - v |= v >> 4; - v |= v >> 8; - v |= v >> 16; - v++; - - return (v); + return (1 << fls(v - 1)); } /* From b8a496dfb6df7b86e014d0d4476cd75850e060c1 Mon Sep 17 00:00:00 2001 From: Andrew Turner Date: Wed, 12 Jun 2024 11:49:05 +0000 Subject: [PATCH 42/91] lib: Remove __ARM_ARCH checks that are always true Reviewed by: imp Differential Revision: https://reviews.freebsd.org/D45559 --- lib/libc/arm/aeabi/aeabi_vfp.h | 2 +- lib/libc/arm/gen/_setjmp.S | 8 ++++---- lib/libc/arm/gen/setjmp.S | 4 ++-- lib/libsys/arm/__vdso_gettc.c | 7 ------- lib/msun/arm/fenv.c | 6 +----- 5 files changed, 8 insertions(+), 19 deletions(-) diff --git a/lib/libc/arm/aeabi/aeabi_vfp.h b/lib/libc/arm/aeabi/aeabi_vfp.h index f87f9acfd0a2..3b70fe06fab3 100644 --- a/lib/libc/arm/aeabi/aeabi_vfp.h +++ b/lib/libc/arm/aeabi/aeabi_vfp.h @@ -65,7 +65,7 @@ * C Helper macros */ -#if __ARM_ARCH >= 6 && !defined(SOFTFLOAT_FOR_GCC) +#if 
!defined(SOFTFLOAT_FOR_GCC) /* * Generate a function that will either call into the VFP implementation, * or the soft float version for a given __aeabi_* helper. The function diff --git a/lib/libc/arm/gen/_setjmp.S b/lib/libc/arm/gen/_setjmp.S index 19b8b6b07059..9e655d2e9e2e 100644 --- a/lib/libc/arm/gen/_setjmp.S +++ b/lib/libc/arm/gen/_setjmp.S @@ -58,12 +58,12 @@ ENTRY(_setjmp) ldr r1, .L_setjmp_magic -#if !defined(_STANDALONE) && __ARM_ARCH >= 6 && !defined(SOFTFLOAT_FOR_GCC) +#if !defined(_STANDALONE) && !defined(SOFTFLOAT_FOR_GCC) add r2, r0, #(_JB_REG_D8 * 4) vstmia r2, {d8-d15} vmrs r2, fpscr str r2, [r0, #(_JB_REG_FPSCR * 4)] -#endif /* !_STANDALONE && __ARM_ARCH >= 6 */ +#endif /* !_STANDALONE && !SOFTFLOAT_FOR_GCC */ str r1, [r0] @@ -91,12 +91,12 @@ ENTRY(_longjmp) teq ip, r2 /* magic correct? */ bne botch /* no, botch */ -#if !defined(_STANDALONE) && __ARM_ARCH >= 6 && !defined(SOFTFLOAT_FOR_GCC) +#if !defined(_STANDALONE) && !defined(SOFTFLOAT_FOR_GCC) add ip, r0, #(_JB_REG_D8 * 4) vldmia ip, {d8-d15} ldr ip, [r0, #(_JB_REG_FPSCR * 4)] vmsr fpscr, ip -#endif /* !_STANDALONE && __ARM_ARCH >= 6 */ +#endif /* !_STANDALONE && !SOFTFLOAT_FOR_GCC */ add r0, r0, #(_JB_REG_R4 * 4) /* Restore integer registers */ diff --git a/lib/libc/arm/gen/setjmp.S b/lib/libc/arm/gen/setjmp.S index 5a6c899e2b23..e7f8b788e878 100644 --- a/lib/libc/arm/gen/setjmp.S +++ b/lib/libc/arm/gen/setjmp.S @@ -61,7 +61,7 @@ ENTRY(setjmp) ldr r1, .Lsetjmp_magic -#if __ARM_ARCH >= 6 && !defined(SOFTFLOAT_FOR_GCC) +#if !defined(SOFTFLOAT_FOR_GCC) add r2, r0, #(_JB_REG_D8 * 4) vstmia r2, {d8-d15} vmrs r2, fpscr @@ -102,7 +102,7 @@ ENTRY(__longjmp) bl PIC_SYM(_C_LABEL(sigprocmask), PLT) ldmfd sp!, {r0-r2, r14} -#if __ARM_ARCH >= 6 && !defined(SOFTFLOAT_FOR_GCC) +#if !defined(SOFTFLOAT_FOR_GCC) add ip, r0, #(_JB_REG_D8 * 4) vldmia ip, {d8-d15} ldr ip, [r0, #(_JB_REG_FPSCR * 4)] diff --git a/lib/libsys/arm/__vdso_gettc.c b/lib/libsys/arm/__vdso_gettc.c index ea70dec35cd8..cb4bdec1e8ef 100644 
--- a/lib/libsys/arm/__vdso_gettc.c +++ b/lib/libsys/arm/__vdso_gettc.c @@ -37,7 +37,6 @@ #include #include "libc_private.h" -#if __ARM_ARCH >= 6 static inline uint64_t cp15_cntvct_get(void) { @@ -55,7 +54,6 @@ cp15_cntpct_get(void) __asm __volatile("mrrc\tp15, 0, %Q0, %R0, c14" : "=r" (reg)); return (reg); } -#endif #pragma weak __vdso_gettc int @@ -64,7 +62,6 @@ __vdso_gettc(const struct vdso_timehands *th, u_int *tc) if (th->th_algo != VDSO_TH_ALGO_ARM_GENTIM) return (ENOSYS); -#if __ARM_ARCH >= 6 /* * Userspace gettimeofday() is only enabled on ARMv7 CPUs, but * libc is compiled for ARMv6. Due to clang issues, .arch @@ -73,10 +70,6 @@ __vdso_gettc(const struct vdso_timehands *th, u_int *tc) __asm __volatile(".word\t0xf57ff06f" : : : "memory"); /* isb */ *tc = th->th_physical == 0 ? cp15_cntvct_get() : cp15_cntpct_get(); return (0); -#else - *tc = 0; - return (ENOSYS); -#endif } #pragma weak __vdso_gettimekeep diff --git a/lib/msun/arm/fenv.c b/lib/msun/arm/fenv.c index 9f172d5fd7c9..05b3adb05f81 100644 --- a/lib/msun/arm/fenv.c +++ b/lib/msun/arm/fenv.c @@ -32,10 +32,6 @@ #include -#if __ARM_ARCH >= 6 -#define FENV_ARMv6 -#endif - /* When SOFTFP_ABI is defined we are using the softfp ABI. */ #if defined(__VFP_FP__) && !defined(__ARM_PCS_VFP) #define SOFTFP_ABI @@ -52,7 +48,7 @@ const fenv_t __fe_dfl_env = 0; /* If this is a non-mangled softfp version special processing is required */ -#if defined(FENV_MANGLE) || !defined(SOFTFP_ABI) || !defined(FENV_ARMv6) +#if defined(FENV_MANGLE) || !defined(SOFTFP_ABI) /* * The following macros map between the softfloat emulator's flags and From bbdf32d94c2f746b914aa87b1104b579426d8100 Mon Sep 17 00:00:00 2001 From: Andrew Turner Date: Wed, 12 Jun 2024 11:49:13 +0000 Subject: [PATCH 43/91] nanobsd: Remove pre-armv6 support Remove support for pre-armv6 from nanobsd. It was removed from FreeBSD in 2020. 
Reviewed by: imp, emaste Sponsored by: Arm Ltd Differential Revision: https://reviews.freebsd.org/D45560 --- tools/tools/nanobsd/embedded/common | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tools/tools/nanobsd/embedded/common b/tools/tools/nanobsd/embedded/common index bcb1061df629..4f5eac7806f2 100644 --- a/tools/tools/nanobsd/embedded/common +++ b/tools/tools/nanobsd/embedded/common @@ -51,7 +51,6 @@ # o documentation for how to run the qemu images # o msdos mtools fallback # o special boot for !x86 !arm platforms -# o qemu image for arm # o qemu image for aarch64 # o qemu image for armv6/armv7 # o easy support for different image / vm formats @@ -503,9 +502,6 @@ std_amd64 ( ) { std_i386 } -std_arm ( ) { -} - std_armv6 ( ) { } From 62cb671705eb561d5a56c1c2dd2aff2ef984d035 Mon Sep 17 00:00:00 2001 From: Ruslan Bukin Date: Wed, 12 Jun 2024 13:36:05 +0100 Subject: [PATCH 44/91] riscv: include ahci device to GENERIC. This is needed for bhyve guest VM. Reviewed by: mhorne Sponsored by: UKRI Differential Revision: https://reviews.freebsd.org/D45497 --- sys/riscv/conf/GENERIC | 1 + 1 file changed, 1 insertion(+) diff --git a/sys/riscv/conf/GENERIC b/sys/riscv/conf/GENERIC index 52edc01b5c89..ce9038cee4a9 100644 --- a/sys/riscv/conf/GENERIC +++ b/sys/riscv/conf/GENERIC @@ -93,6 +93,7 @@ device riscv_syscon device pci # Block devices +device ahci device scbus device da From a7ae78caaa17ff840da844c1dcaa780d194c9e20 Mon Sep 17 00:00:00 2001 From: Andrew Turner Date: Wed, 12 Jun 2024 14:04:13 +0100 Subject: [PATCH 45/91] cdefs: Add __writeonly to mark write only vars When a variable is write only and can't be removed, e.g. for API reasons, it is useful to document this fact similar to __diagused and __witness_used. Add __writeonly to tell the compiler and anyone looking at the code that this variable is expected to only be written to, and to not raise an error. 
Reviewed by: imp, kib Sponsored by: Arm Ltd Differential Revision: https://reviews.freebsd.org/D45561 --- sys/sys/cdefs.h | 1 + 1 file changed, 1 insertion(+) diff --git a/sys/sys/cdefs.h b/sys/sys/cdefs.h index 037dfa55f923..c3268791787f 100644 --- a/sys/sys/cdefs.h +++ b/sys/sys/cdefs.h @@ -180,6 +180,7 @@ #define __aligned(x) __attribute__((__aligned__(x))) #define __section(x) __attribute__((__section__(x))) #endif +#define __writeonly __unused #if __GNUC_PREREQ__(4, 3) || __has_attribute(__alloc_size__) #define __alloc_size(x) __attribute__((__alloc_size__(x))) #define __alloc_size2(n, x) __attribute__((__alloc_size__(n, x))) From 19782e5bef3403a6ed65d46653f2e70f81dced37 Mon Sep 17 00:00:00 2001 From: Andrew Turner Date: Wed, 12 Jun 2024 14:04:45 +0100 Subject: [PATCH 46/91] ibcore: Mark write-only variables Some LinuxKPI lock macros need a flags field passed in. This is written to but never read from so gcc complains. Fix this by marking the flags variables as unused to quieten the compiler. 
Reviewed by: brooks (earlier version), kib Sponsored by: Arm Ltd Differential Revision: https://reviews.freebsd.org/D45303 --- sys/ofed/drivers/infiniband/core/ib_cache.c | 16 ++++++++-------- sys/ofed/drivers/infiniband/core/ib_cm.c | 4 ++-- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/sys/ofed/drivers/infiniband/core/ib_cache.c b/sys/ofed/drivers/infiniband/core/ib_cache.c index b170f2b637f9..d2cc680796ef 100644 --- a/sys/ofed/drivers/infiniband/core/ib_cache.c +++ b/sys/ofed/drivers/infiniband/core/ib_cache.c @@ -465,7 +465,7 @@ static int _ib_cache_gid_table_find(struct ib_device *ib_dev, struct ib_gid_table *table; u8 p; int local_index; - unsigned long flags; + unsigned long flags __writeonly; for (p = 0; p < ib_dev->phys_port_cnt; p++) { table = ports_table[p]; @@ -514,7 +514,7 @@ int ib_find_cached_gid_by_port(struct ib_device *ib_dev, unsigned long mask = GID_ATTR_FIND_MASK_GID | GID_ATTR_FIND_MASK_GID_TYPE; struct ib_gid_attr val = {.ndev = ndev, .gid_type = gid_type}; - unsigned long flags; + unsigned long flags __writeonly; if (!rdma_is_port_valid(ib_dev, port)) return -ENOENT; @@ -570,7 +570,7 @@ static int ib_cache_gid_find_by_filter(struct ib_device *ib_dev, struct ib_gid_table **ports_table = ib_dev->cache.gid_cache; struct ib_gid_table *table; unsigned int i; - unsigned long flags; + unsigned long flags __writeonly; bool found = false; if (!ports_table) @@ -879,7 +879,7 @@ int ib_get_cached_gid(struct ib_device *device, struct ib_gid_attr *gid_attr) { int res; - unsigned long flags; + unsigned long flags __writeonly; struct ib_gid_table **ports_table = device->cache.gid_cache; struct ib_gid_table *table = ports_table[port_num - rdma_start_port(device)]; @@ -929,7 +929,7 @@ int ib_get_cached_pkey(struct ib_device *device, u16 *pkey) { struct ib_pkey_cache *cache; - unsigned long flags; + unsigned long flags __writeonly; int ret = 0; if (!rdma_is_port_valid(device, port_num)) @@ -956,7 +956,7 @@ int ib_find_cached_pkey(struct 
ib_device *device, u16 *index) { struct ib_pkey_cache *cache; - unsigned long flags; + unsigned long flags __writeonly; int i; int ret = -ENOENT; int partial_ix = -1; @@ -997,7 +997,7 @@ int ib_find_exact_cached_pkey(struct ib_device *device, u16 *index) { struct ib_pkey_cache *cache; - unsigned long flags; + unsigned long flags __writeonly; int i; int ret = -ENOENT; @@ -1027,7 +1027,7 @@ int ib_get_cached_lmc(struct ib_device *device, u8 port_num, u8 *lmc) { - unsigned long flags; + unsigned long flags __writeonly; int ret = 0; if (!rdma_is_port_valid(device, port_num)) diff --git a/sys/ofed/drivers/infiniband/core/ib_cm.c b/sys/ofed/drivers/infiniband/core/ib_cm.c index 3ee17a847720..7ace287b1c88 100644 --- a/sys/ofed/drivers/infiniband/core/ib_cm.c +++ b/sys/ofed/drivers/infiniband/core/ib_cm.c @@ -4057,7 +4057,7 @@ static void cm_add_one(struct ib_device *ib_device) struct ib_port_modify port_modify = { .set_port_cap_mask = IB_PORT_CM_SUP }; - unsigned long flags; + unsigned long flags __writeonly; int ret; int count = 0; u8 i; @@ -4150,7 +4150,7 @@ static void cm_remove_one(struct ib_device *ib_device, void *client_data) struct ib_port_modify port_modify = { .clr_port_cap_mask = IB_PORT_CM_SUP }; - unsigned long flags; + unsigned long flags __writeonly; int i; if (!cm_dev) From 4eec584d79c1e8375d863c7eec7229ac7ec3f13b Mon Sep 17 00:00:00 2001 From: Andrew Turner Date: Wed, 12 Jun 2024 14:08:13 +0100 Subject: [PATCH 47/91] arm64: Clear td_frame when returning to userspace When returning from an exception to userspace clear the saved td_frame. On the next exception this should point to the frame, however this is not guaranteed. To ensure the trap frame pointer is either valid or NULL clear it before returning to userspace in the EL0 synchronous exception handler. 
Reviewed by: kib, markj Sponsored by: Arm Ltd Differential Revision: https://reviews.freebsd.org/D44807 --- sys/arm64/arm64/exception.S | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sys/arm64/arm64/exception.S b/sys/arm64/arm64/exception.S index 662684588e0c..3dff834e27a2 100644 --- a/sys/arm64/arm64/exception.S +++ b/sys/arm64/arm64/exception.S @@ -252,9 +252,11 @@ ENTRY(handle_el0_sync) KMSAN_ENTER ldr x0, [x18, #PC_CURTHREAD] mov x1, sp + mov x22, x0 str x1, [x0, #TD_FRAME] bl do_el0_sync do_ast + str xzr, [x22, #TD_FRAME] KMSAN_LEAVE restore_registers 0 ERET From a30149b2a9c6ac5280523eea9570e5b5e5f1fdf8 Mon Sep 17 00:00:00 2001 From: Andrew Turner Date: Wed, 12 Jun 2024 14:09:14 +0100 Subject: [PATCH 48/91] arm64: Create a version of vfp_save_state for cpu_switch This will be used when we add SVE support to reduce the registers needed to be saved on context switch. Reviewed by: imp Sponsored by: Arm Ltd Differential Revision: https://reviews.freebsd.org/D43305 --- sys/arm64/arm64/swtch.S | 4 +--- sys/arm64/arm64/vfp.c | 8 ++++++++ sys/arm64/include/vfp.h | 1 + 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/sys/arm64/arm64/swtch.S b/sys/arm64/arm64/swtch.S index ca00d473fd47..6af70ca839a0 100644 --- a/sys/arm64/arm64/swtch.S +++ b/sys/arm64/arm64/swtch.S @@ -148,9 +148,7 @@ ENTRY(cpu_switch) mov x21, x2 #ifdef VFP - /* Load the pcb address */ - mov x1, x4 - bl vfp_save_state + bl vfp_save_state_switch mov x0, x20 #else mov x0, x1 diff --git a/sys/arm64/arm64/vfp.c b/sys/arm64/arm64/vfp.c index f35cd960702b..c65108a83399 100644 --- a/sys/arm64/arm64/vfp.c +++ b/sys/arm64/arm64/vfp.c @@ -216,6 +216,14 @@ vfp_save_state_savectx(struct pcb *pcb) vfp_save_state_common(curthread, pcb); } +void +vfp_save_state_switch(struct thread *td) +{ + KASSERT(td != NULL, ("NULL vfp thread")); + + vfp_save_state_common(td, td->td_pcb); +} + /* * Update the VFP state for a forked process or new thread. The PCB will * have been copied from the old thread. 
diff --git a/sys/arm64/include/vfp.h b/sys/arm64/include/vfp.h index 7f4c86e7737d..47d068d6050c 100644 --- a/sys/arm64/include/vfp.h +++ b/sys/arm64/include/vfp.h @@ -79,6 +79,7 @@ void vfp_reset_state(struct thread *, struct pcb *); void vfp_restore_state(void); void vfp_save_state(struct thread *, struct pcb *); void vfp_save_state_savectx(struct pcb *); +void vfp_save_state_switch(struct thread *); struct fpu_kern_ctx; From 8a9f0fa42b1c6cffd45459bb552e138083b00369 Mon Sep 17 00:00:00 2001 From: Michael Gmelin Date: Wed, 12 Jun 2024 18:11:52 +0200 Subject: [PATCH 49/91] ifconfig: Fix default netmask calculation Reported by: phk Reviewed by: emaste, kp MFC after: 3 days Differential Revision: https://reviews.freebsd.org/D45570 --- sbin/ifconfig/af_inet.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sbin/ifconfig/af_inet.c b/sbin/ifconfig/af_inet.c index 5e3084165b33..e21956cfc4fd 100644 --- a/sbin/ifconfig/af_inet.c +++ b/sbin/ifconfig/af_inet.c @@ -440,7 +440,7 @@ in_exec_nl(if_ctx *ctx, unsigned long action, void *data) static void in_setdefaultmask_nl(void) { - struct in_px *px = sintab_nl[ADDR]; + struct in_px *px = sintab_nl[ADDR]; in_addr_t i = ntohl(px->addr.s_addr); @@ -451,11 +451,11 @@ in_setdefaultmask_nl(void) * we should return an error rather than warning. */ if (IN_CLASSA(i)) - px->plen = IN_CLASSA_NSHIFT; + px->plen = 32 - IN_CLASSA_NSHIFT; else if (IN_CLASSB(i)) - px->plen = IN_CLASSB_NSHIFT; + px->plen = 32 - IN_CLASSB_NSHIFT; else - px->plen = IN_CLASSC_NSHIFT; + px->plen = 32 - IN_CLASSC_NSHIFT; px->maskset = true; } #endif From 20a2fe68faacb98b3c87ce7ea46a16b0d6c2462b Mon Sep 17 00:00:00 2001 From: Kristof Provost Date: Wed, 12 Jun 2024 20:01:58 +0200 Subject: [PATCH 50/91] pf: correctly reset max_win if the SYN-ACK lacks a wscale option. pf was setting max_win to 0 and discarded retransmitted SYN-ACK segments without wscale if the original SYN contained a wscale option. 
with gerhard@, ok henning@ Obtained From: OpenBSD Sponsored by: Rubicon Communications, LLC ("Netgate") --- sys/netpfil/pf/pf.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sys/netpfil/pf/pf.c b/sys/netpfil/pf/pf.c index 8c97d1bf200d..c635251c3490 100644 --- a/sys/netpfil/pf/pf.c +++ b/sys/netpfil/pf/pf.c @@ -5315,8 +5315,9 @@ pf_tcp_track_full(struct pf_kstate **state, struct pfi_kkif *kif, dws = dst->wscale & PF_WSCALE_MASK; } else { /* fixup other window */ - dst->max_win <<= dst->wscale & - PF_WSCALE_MASK; + dst->max_win = MIN(TCP_MAXWIN, + (u_int32_t)dst->max_win << + (dst->wscale & PF_WSCALE_MASK)); /* in case of a retrans SYN|ACK */ dst->wscale = 0; } From 07ed2396985f211a1f9c2f84da99f955650df696 Mon Sep 17 00:00:00 2001 From: Kristof Provost Date: Wed, 12 Jun 2024 20:05:22 +0200 Subject: [PATCH 51/91] pf: make TCP sequence number tracking less strict by one octet for FIN packets The data of a TCP packet must fit into the announced window, but this is not required for the sequence number of the FIN. A packet with the FIN bit set and containing data that fits exactly into the announced window was blocked. Our stack generates such packets when the receive buffer size is set to 1024. Now pf uses only the data length for window comparison. 
OK henning@ Obtained From: OpenBSD Sponsored by: Rubicon Communications, LLC ("Netgate") --- sys/netpfil/pf/pf.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/sys/netpfil/pf/pf.c b/sys/netpfil/pf/pf.c index c635251c3490..edb95d7ef0ec 100644 --- a/sys/netpfil/pf/pf.c +++ b/sys/netpfil/pf/pf.c @@ -5246,7 +5246,7 @@ pf_tcp_track_full(struct pf_kstate **state, struct pfi_kkif *kif, struct tcphdr *th = &pd->hdr.tcp; struct pf_state_peer *src, *dst; u_int16_t win = ntohs(th->th_win); - u_int32_t ack, end, seq, orig_seq; + u_int32_t ack, end, data_end, seq, orig_seq; u_int8_t sws, dws, psrc, pdst; int ackskew; @@ -5323,6 +5323,7 @@ pf_tcp_track_full(struct pf_kstate **state, struct pfi_kkif *kif, } } } + data_end = end; if (th->th_flags & TH_FIN) end++; @@ -5353,6 +5354,7 @@ pf_tcp_track_full(struct pf_kstate **state, struct pfi_kkif *kif, end = seq + pd->p_len; if (th->th_flags & TH_SYN) end++; + data_end = end; if (th->th_flags & TH_FIN) end++; } @@ -5374,7 +5376,7 @@ pf_tcp_track_full(struct pf_kstate **state, struct pfi_kkif *kif, if (seq == end) { /* Ease sequencing restrictions on no data packets */ seq = src->seqlo; - end = seq; + data_end = end = seq; } ackskew = dst->seqlo - ack; @@ -5397,7 +5399,7 @@ pf_tcp_track_full(struct pf_kstate **state, struct pfi_kkif *kif, } #define MAXACKWINDOW (0xffff + 1500) /* 1500 is an arbitrary fudge factor */ - if (SEQ_GEQ(src->seqhi, end) && + if (SEQ_GEQ(src->seqhi, data_end) && /* Last octet inside other's window space */ SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)) && /* Retrans: not more than one window back */ @@ -5471,7 +5473,7 @@ pf_tcp_track_full(struct pf_kstate **state, struct pfi_kkif *kif, } else if ((dst->state < TCPS_SYN_SENT || dst->state >= TCPS_FIN_WAIT_2 || src->state >= TCPS_FIN_WAIT_2) && - SEQ_GEQ(src->seqhi + MAXACKWINDOW, end) && + SEQ_GEQ(src->seqhi + MAXACKWINDOW, data_end) && /* Within a window forward of the originating packet */ SEQ_GEQ(seq, src->seqlo - 
MAXACKWINDOW)) { /* Within a window backward of the originating packet */ @@ -5564,12 +5566,12 @@ pf_tcp_track_full(struct pf_kstate **state, struct pfi_kkif *kif, pd->dir == PF_IN ? "in" : "out", pd->dir == (*state)->direction ? "fwd" : "rev"); printf("pf: State failure on: %c %c %c %c | %c %c\n", - SEQ_GEQ(src->seqhi, end) ? ' ' : '1', + SEQ_GEQ(src->seqhi, data_end) ? ' ' : '1', SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)) ? ' ': '2', (ackskew >= -MAXACKWINDOW) ? ' ' : '3', (ackskew <= (MAXACKWINDOW << sws)) ? ' ' : '4', - SEQ_GEQ(src->seqhi + MAXACKWINDOW, end) ?' ' :'5', + SEQ_GEQ(src->seqhi + MAXACKWINDOW, data_end) ?' ' :'5', SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW) ?' ' :'6'); } REASON_SET(reason, PFRES_BADSTATE); From ec1f285f2e631b8aae3e08b3f68c6451a45c0941 Mon Sep 17 00:00:00 2001 From: Rick Macklem Date: Wed, 12 Jun 2024 16:11:10 -0700 Subject: [PATCH 52/91] nfscl: Add support for the NFSv4.1/4.2 WANT_xxx flags NFSv4.1/4.2 defined new OPEN_WANT_xxx flags that a client can use to hint to the server that delegations are or are not wanted. This patch adds use of those delegations to the client. This patch should only affect the NFSv4.1/4.2 behaviour when delegations are enabled, which is not the default. 
MFC after: 1 month --- sys/fs/nfsclient/nfs_clrpcops.c | 77 +++++++++++++++++++++++++++++---- 1 file changed, 68 insertions(+), 9 deletions(-) diff --git a/sys/fs/nfsclient/nfs_clrpcops.c b/sys/fs/nfsclient/nfs_clrpcops.c index 8c5532268287..13bdc74655dd 100644 --- a/sys/fs/nfsclient/nfs_clrpcops.c +++ b/sys/fs/nfsclient/nfs_clrpcops.c @@ -389,13 +389,24 @@ nfsrpc_open(vnode_t vp, int amode, struct ucred *cred, NFSPROC_T *p) mode |= NFSV4OPEN_ACCESSREAD; if (amode & FWRITE) mode |= NFSV4OPEN_ACCESSWRITE; + if (NFSHASNFSV4N(nmp)) { + if (!NFSHASPNFS(nmp) && nfscl_enablecallb != 0 && + nfs_numnfscbd > 0) { + if ((mode & NFSV4OPEN_ACCESSWRITE) != 0) + mode |= NFSV4OPEN_WANTWRITEDELEG; + else + mode |= NFSV4OPEN_WANTANYDELEG; + } else + mode |= NFSV4OPEN_WANTNODELEG; + } nfhp = np->n_fhp; retrycnt = 0; do { dp = NULL; - error = nfscl_open(vp, nfhp->nfh_fh, nfhp->nfh_len, mode, 1, - cred, p, NULL, &op, &newone, &ret, 1, true); + error = nfscl_open(vp, nfhp->nfh_fh, nfhp->nfh_len, + (mode & NFSV4OPEN_ACCESSBOTH), 1, cred, p, NULL, + &op, &newone, &ret, 1, true); if (error) { return (error); } @@ -547,7 +558,8 @@ nfsrpc_openrpc(struct nfsmount *nmp, vnode_t vp, u_int8_t *nfhp, int fhlen, cred); NFSM_BUILD(tl, u_int32_t *, 5 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(op->nfso_own->nfsow_seqid); - *tl++ = txdr_unsigned(mode & NFSV4OPEN_ACCESSBOTH); + *tl++ = txdr_unsigned(mode & (NFSV4OPEN_ACCESSBOTH | + NFSV4OPEN_WANTDELEGMASK)); *tl++ = txdr_unsigned((mode >> NFSLCK_SHIFT) & NFSV4OPEN_DENYBOTH); tsep = nfsmnt_mdssession(nmp); *tl++ = tsep->nfsess_clientid.lval[0]; @@ -664,6 +676,13 @@ nfsrpc_openrpc(struct nfsmount *nmp, vnode_t vp, u_int8_t *nfhp, int fhlen, &ret, &acesize, p); if (error) goto nfsmout; + } else if (deleg == NFSV4OPEN_DELEGATENONEEXT && + NFSHASNFSV4N(nmp)) { + NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); + deleg = fxdr_unsigned(uint32_t, *tl); + if (deleg == NFSV4OPEN_CONTENTION || + deleg == NFSV4OPEN_RESOURCE) + NFSM_DISSECT(tl, uint32_t *, 
NFSX_UNSIGNED); } else if (deleg != NFSV4OPEN_DELEGATENONE) { error = NFSERR_BADXDR; goto nfsmout; @@ -1546,7 +1565,7 @@ nfsrpc_lookup(vnode_t dvp, char *name, int len, struct ucred *cred, NFSM_BUILD(tl, uint32_t *, 6 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(NFSV4OP_OPEN); *tl++ = 0; /* seqid, ignored. */ - *tl++ = txdr_unsigned(openmode); + *tl++ = txdr_unsigned(openmode | NFSV4OPEN_WANTNODELEG); *tl++ = txdr_unsigned(NFSV4OPEN_DENYNONE); *tl++ = 0; /* ClientID, ignored. */ *tl = 0; @@ -1668,6 +1687,13 @@ nfsrpc_lookup(vnode_t dvp, char *name, int len, struct ucred *cred, ndp->nfsdl_stateid.other[0] = *tl++; ndp->nfsdl_stateid.other[1] = *tl++; ndp->nfsdl_stateid.other[2] = *tl++; + } else if (deleg == NFSV4OPEN_DELEGATENONEEXT && + NFSHASNFSV4N(nmp)) { + NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); + deleg = fxdr_unsigned(uint32_t, *tl); + if (deleg == NFSV4OPEN_CONTENTION || + deleg == NFSV4OPEN_RESOURCE) + NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); } else if (deleg != NFSV4OPEN_DELEGATENONE) { error = NFSERR_BADXDR; goto nfsmout; @@ -2595,8 +2621,17 @@ nfsrpc_createv4(vnode_t dvp, char *name, int namelen, struct vattr *vap, */ NFSM_BUILD(tl, u_int32_t *, 5 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(owp->nfsow_seqid); - *tl++ = txdr_unsigned(NFSV4OPEN_ACCESSWRITE | - NFSV4OPEN_ACCESSREAD); + if (NFSHASNFSV4N(nmp)) { + if (!NFSHASPNFS(nmp) && nfscl_enablecallb != 0 && + nfs_numnfscbd > 0) + *tl++ = txdr_unsigned(NFSV4OPEN_ACCESSWRITE | + NFSV4OPEN_ACCESSREAD | NFSV4OPEN_WANTWRITEDELEG); + else + *tl++ = txdr_unsigned(NFSV4OPEN_ACCESSWRITE | + NFSV4OPEN_ACCESSREAD | NFSV4OPEN_WANTNODELEG); + } else + *tl++ = txdr_unsigned(NFSV4OPEN_ACCESSWRITE | + NFSV4OPEN_ACCESSREAD); *tl++ = txdr_unsigned(NFSV4OPEN_DENYNONE); tsep = nfsmnt_mdssession(nmp); *tl++ = tsep->nfsess_clientid.lval[0]; @@ -2714,6 +2749,13 @@ nfsrpc_createv4(vnode_t dvp, char *name, int namelen, struct vattr *vap, &ret, &acesize, p); if (error) goto nfsmout; + } else if (deleg == NFSV4OPEN_DELEGATENONEEXT 
&& + NFSHASNFSV4N(nmp)) { + NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); + deleg = fxdr_unsigned(uint32_t, *tl); + if (deleg == NFSV4OPEN_CONTENTION || + deleg == NFSV4OPEN_RESOURCE) + NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); } else if (deleg != NFSV4OPEN_DELEGATENONE) { error = NFSERR_BADXDR; goto nfsmout; @@ -8109,7 +8151,8 @@ nfsrpc_openlayoutrpc(struct nfsmount *nmp, vnode_t vp, u_int8_t *nfhp, 0, 0, cred); NFSM_BUILD(tl, uint32_t *, 5 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(op->nfso_own->nfsow_seqid); - *tl++ = txdr_unsigned(mode & NFSV4OPEN_ACCESSBOTH); + *tl++ = txdr_unsigned(mode & (NFSV4OPEN_ACCESSBOTH | + NFSV4OPEN_WANTDELEGMASK)); *tl++ = txdr_unsigned((mode >> NFSLCK_SHIFT) & NFSV4OPEN_DENYBOTH); tsep = nfsmnt_mdssession(nmp); *tl++ = tsep->nfsess_clientid.lval[0]; @@ -8210,6 +8253,13 @@ nfsrpc_openlayoutrpc(struct nfsmount *nmp, vnode_t vp, u_int8_t *nfhp, &ret, &acesize, p); if (error != 0) goto nfsmout; + } else if (deleg == NFSV4OPEN_DELEGATENONEEXT && + NFSHASNFSV4N(nmp)) { + NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); + deleg = fxdr_unsigned(uint32_t, *tl); + if (deleg == NFSV4OPEN_CONTENTION || + deleg == NFSV4OPEN_RESOURCE) + NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); } else if (deleg != NFSV4OPEN_DELEGATENONE) { error = NFSERR_BADXDR; goto nfsmout; @@ -8301,8 +8351,17 @@ nfsrpc_createlayout(vnode_t dvp, char *name, int namelen, struct vattr *vap, */ NFSM_BUILD(tl, u_int32_t *, 5 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(owp->nfsow_seqid); - *tl++ = txdr_unsigned(NFSV4OPEN_ACCESSWRITE | - NFSV4OPEN_ACCESSREAD); + if (NFSHASNFSV4N(nmp)) { + if (!NFSHASPNFS(nmp) && nfscl_enablecallb != 0 && + nfs_numnfscbd > 0) + *tl++ = txdr_unsigned(NFSV4OPEN_ACCESSWRITE | + NFSV4OPEN_ACCESSREAD | NFSV4OPEN_WANTWRITEDELEG); + else + *tl++ = txdr_unsigned(NFSV4OPEN_ACCESSWRITE | + NFSV4OPEN_ACCESSREAD | NFSV4OPEN_WANTNODELEG); + } else + *tl++ = txdr_unsigned(NFSV4OPEN_ACCESSWRITE | + NFSV4OPEN_ACCESSREAD); *tl++ = txdr_unsigned(NFSV4OPEN_DENYNONE); tsep = 
nfsmnt_mdssession(nmp); *tl++ = tsep->nfsess_clientid.lval[0]; From 4308d6e0fc09c807483f09b8fea7f5182d19fd01 Mon Sep 17 00:00:00 2001 From: Rick Macklem Date: Wed, 12 Jun 2024 16:17:23 -0700 Subject: [PATCH 53/91] nfscl: Add a check for VREG for delegations Since delegations are only issued for regular files, check v_type to see if the query is for a regular file. This is a simple optimization for the non-VREG case. While here, fix a couple of global variable declarations. This patch should only affect the NFSv4.1/4.2 behaviour when delegations are enabled, which is not the default. MFC after: 1 month --- sys/fs/nfsclient/nfs_clstate.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/sys/fs/nfsclient/nfs_clstate.c b/sys/fs/nfsclient/nfs_clstate.c index 9fbaa6e63a56..aef5d71592af 100644 --- a/sys/fs/nfsclient/nfs_clstate.c +++ b/sys/fs/nfsclient/nfs_clstate.c @@ -93,9 +93,8 @@ NFSREQSPINLOCK; NFSCLSTATEMUTEX; int nfscl_inited = 0; struct nfsclhead nfsclhead; /* Head of clientid list */ -int nfscl_deleghighwater = NFSCLDELEGHIGHWATER; -int nfscl_layouthighwater = NFSCLLAYOUTHIGHWATER; +static int nfscl_deleghighwater = NFSCLDELEGHIGHWATER; static int nfscl_delegcnt = 0; static int nfscl_layoutcnt = 0; static int nfscl_getopen(struct nfsclownerhead *, struct nfsclopenhash *, @@ -4647,7 +4646,7 @@ nfscl_mustflush(vnode_t vp) np = VTONFS(vp); nmp = VFSTONFS(vp->v_mount); - if (!NFSHASNFSV4(nmp)) + if (!NFSHASNFSV4(nmp) || vp->v_type != VREG) return (1); NFSLOCKMNT(nmp); if ((nmp->nm_privflag & NFSMNTP_DELEGISSUED) == 0) { @@ -4687,7 +4686,7 @@ nfscl_nodeleg(vnode_t vp, int writedeleg) np = VTONFS(vp); nmp = VFSTONFS(vp->v_mount); - if (!NFSHASNFSV4(nmp)) + if (!NFSHASNFSV4(nmp) || vp->v_type != VREG) return (1); NFSLOCKMNT(nmp); if ((nmp->nm_privflag & NFSMNTP_DELEGISSUED) == 0) { From bb53f071e85a2ebb5b405e7fec4661a725b7caf5 Mon Sep 17 00:00:00 2001 From: Rick Macklem Date: Wed, 12 Jun 2024 16:41:12 -0700 Subject: [PATCH 54/91] nfscl: Add 
support for read delegations and atomic upgrade For NFSv4.1/4.2, an atomic upgrade of a delegation from a read delegation to a write delegation is allowed and can result in significantly improved performance. This patch adds this upgrade to the NFSv4.1/4.2 client and enables use of read delegations. For a test case of building a FreeBSD kernel (sources and output objects) over a NFSv4.2 mount, these changes reduced the elapsed time by 30% and included a reduction of 80% for RPC counts when delegations were enabled. As such, with this patch there are at least certain cases where enabling delegations seems to be worth the increased complexity they bring. This patch should only affect the NFSv4.1/4.2 behaviour when delegations are enabled, which is not the default. MFC after: 1 month --- sys/fs/nfsclient/nfs_clstate.c | 38 ++++++++++++++++++++-------------- sys/fs/nfsclient/nfs_clsubs.c | 6 +++--- sys/fs/nfsclient/nfs_clvnops.c | 2 +- 3 files changed, 26 insertions(+), 20 deletions(-) diff --git a/sys/fs/nfsclient/nfs_clstate.c b/sys/fs/nfsclient/nfs_clstate.c index aef5d71592af..0a1eb51e279f 100644 --- a/sys/fs/nfsclient/nfs_clstate.c +++ b/sys/fs/nfsclient/nfs_clstate.c @@ -439,18 +439,6 @@ nfscl_deleg(mount_t mp, struct nfsclclient *clp, u_int8_t *nfhp, KASSERT(mp != NULL, ("nfscl_deleg: mp NULL")); nmp = VFSTONFS(mp); - /* - * First, if we have received a Read delegation for a file on a - * read/write file system, just return it, because they aren't - * useful, imho. - */ - if (dp != NULL && !NFSMNT_RDONLY(mp) && - (dp->nfsdl_flags & NFSCLDL_READ)) { - nfscl_trydelegreturn(dp, cred, nmp, p); - free(dp, M_NFSCLDELEG); - *dpp = NULL; - return (0); - } /* * Since a delegation might be added to the mount, @@ -478,17 +466,35 @@ nfscl_deleg(mount_t mp, struct nfsclclient *clp, u_int8_t *nfhp, nfscl_delegcnt++; } else { /* - * Delegation already exists, what do we do if a new one?? + * A delegation already exists. 
If the new one is a Write + * delegation and the old one a Read delegation, return the + * Read delegation. Otherwise, return the new delegation. */ if (dp != NULL) { - printf("Deleg already exists!\n"); - free(dp, M_NFSCLDELEG); - *dpp = NULL; + if ((dp->nfsdl_flags & NFSCLDL_WRITE) != 0 && + (tdp->nfsdl_flags & NFSCLDL_READ) != 0) { + TAILQ_REMOVE(&clp->nfsc_deleg, tdp, nfsdl_list); + LIST_REMOVE(tdp, nfsdl_hash); + *dpp = NULL; + TAILQ_INSERT_HEAD(&clp->nfsc_deleg, dp, + nfsdl_list); + LIST_INSERT_HEAD(NFSCLDELEGHASH(clp, nfhp, + fhlen), dp, nfsdl_hash); + dp->nfsdl_timestamp = NFSD_MONOSEC + 120; + } else { + *dpp = NULL; + tdp = dp; /* Return this one. */ + } } else { *dpp = tdp; + tdp = NULL; } } NFSUNLOCKCLSTATE(); + if (tdp != NULL) { + nfscl_trydelegreturn(tdp, cred, nmp, p); + free(tdp, M_NFSCLDELEG); + } return (0); } diff --git a/sys/fs/nfsclient/nfs_clsubs.c b/sys/fs/nfsclient/nfs_clsubs.c index 80ab979d22d7..8bb51e29e1d1 100644 --- a/sys/fs/nfsclient/nfs_clsubs.c +++ b/sys/fs/nfsclient/nfs_clsubs.c @@ -188,7 +188,7 @@ ncl_getattrcache(struct vnode *vp, struct vattr *vaper) np = VTONFS(vp); vap = &np->n_vattr.na_vattr; nmp = VFSTONFS(vp->v_mount); - mustflush = nfscl_mustflush(vp); /* must be before mtx_lock() */ + mustflush = nfscl_nodeleg(vp, 0); /* must be before mtx_lock() */ NFSLOCKNODE(np); /* XXX n_mtime doesn't seem to be updated on a miss-and-reload */ timeo = (time_second - np->n_mtime.tv_sec) / 10; @@ -221,8 +221,8 @@ ncl_getattrcache(struct vnode *vp, struct vattr *vaper) (time_second - np->n_attrstamp), timeo); #endif - if ((time_second - np->n_attrstamp) >= timeo && - (mustflush != 0 || np->n_attrstamp == 0)) { + if (mustflush != 0 && (np->n_attrstamp == 0 || + time_second - np->n_attrstamp >= timeo)) { nfsstatsv1.attrcache_misses++; NFSUNLOCKNODE(np); KDTRACE_NFS_ATTRCACHE_GET_MISS(vp); diff --git a/sys/fs/nfsclient/nfs_clvnops.c b/sys/fs/nfsclient/nfs_clvnops.c index 76a3cdf9281e..13341dfc26e0 100644 --- a/sys/fs/nfsclient/nfs_clvnops.c 
+++ b/sys/fs/nfsclient/nfs_clvnops.c @@ -940,7 +940,7 @@ nfs_close(struct vop_close_args *ap) /* * Get attributes so "change" is up to date. */ - if (error == 0 && nfscl_mustflush(vp) != 0 && + if (error == 0 && nfscl_nodeleg(vp, 0) != 0 && vp->v_type == VREG && (VFSTONFS(vp->v_mount)->nm_flag & NFSMNT_NOCTO) == 0) { ret = nfsrpc_getattr(vp, cred, ap->a_td, &nfsva); From a16cb8709de7ddc6dd6ab71918af9fc5fd96f377 Mon Sep 17 00:00:00 2001 From: Ed Maste Date: Wed, 12 Jun 2024 20:08:04 -0400 Subject: [PATCH 55/91] tzsetup: Correct UTC description UTC is Coordinated Universal Time, not Greenwich Mean Time. Reviewed by: imp, allanjude MFC after: 1 week Sponsored by: The FreeBSD Foundation Differential Revision: https://reviews.freebsd.org/D45573 --- usr.sbin/tzsetup/tzsetup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/usr.sbin/tzsetup/tzsetup.c b/usr.sbin/tzsetup/tzsetup.c index fee5762b6fa6..617de4efb765 100644 --- a/usr.sbin/tzsetup/tzsetup.c +++ b/usr.sbin/tzsetup/tzsetup.c @@ -957,7 +957,7 @@ main(int argc, char **argv) "If it is set to local time,\n" "or you don't know, please choose NO here!"); - conf.title = "Select local or UTC (Greenwich Mean Time) clock"; + conf.title = "Select local or UTC (Coordinated Universal Time) clock"; if (bsddialog_yesno(&conf, prompt, 7, 73) == BSDDIALOG_YES) { if (reallydoit) unlink(path_wall_cmos_clock); From c0d0bc2bed8003d2f2b24c3c29ce971ca8dfc556 Mon Sep 17 00:00:00 2001 From: Doug Moore Date: Thu, 13 Jun 2024 11:44:38 -0500 Subject: [PATCH 56/91] subr_pctrie: add leaf callbacks to pctrie_reclaim PCTRIE_RECLAIM frees all the interior nodes in a pctrie, but is little used because most trie-destroyers want to free leaves of the tree too. Add PCTRIE_RECLAIM_CALLBACK, with two extra arguments, a callback function and an auxiliary argument, that is invoked on every non-NULL leaf in the tree as the tree is destroyed. 
Reviewed by: rlibby, kib (previous version) Differential Revision: https://reviews.freebsd.org/D45565 --- sys/kern/subr_pctrie.c | 78 ++++++++++++++++++++++++++++++++---------- sys/sys/pctrie.h | 25 ++++++++++++++ 2 files changed, 85 insertions(+), 18 deletions(-) diff --git a/sys/kern/subr_pctrie.c b/sys/kern/subr_pctrie.c index 4017f98c207d..347c2bffd503 100644 --- a/sys/kern/subr_pctrie.c +++ b/sys/kern/subr_pctrie.c @@ -198,7 +198,6 @@ pctrie_root_store(struct pctrie *ptree, struct pctrie_node *node, static __inline bool pctrie_isleaf(struct pctrie_node *node) { - return (((uintptr_t)node & PCTRIE_ISLEAF) != 0); } @@ -217,10 +216,18 @@ pctrie_toleaf(uint64_t *val) static __inline uint64_t * pctrie_toval(struct pctrie_node *node) { - return ((uint64_t *)((uintptr_t)node & ~PCTRIE_FLAGS)); } +/* + * Returns the associated pointer extracted from node and field offset. + */ +static __inline void * +pctrie_toptr(struct pctrie_node *node, int keyoff) +{ + return ((void *)(((uintptr_t)node & ~PCTRIE_FLAGS) - keyoff)); +} + /* * Make 'child' a child of 'node'. */ @@ -792,14 +799,14 @@ pctrie_remove_lookup(struct pctrie *ptree, uint64_t index, } /* - * Prune all the leaves of 'node' before its first non-leaf child, make child - * zero of 'node' point up to 'parent', make 'node' into 'parent' and that - * non-leaf child into 'node'. Repeat until a node has been stripped of all - * children, and mark it for freeing, returning its parent. + * Walk the subtrie rooted at *pnode in order, invoking callback on leaves and + * using the leftmost child pointer for path reversal, until an interior node + * is stripped of all children, and returned for deallocation, with *pnode left + * pointing the parent of that node. 
*/ -static struct pctrie_node * -pctrie_reclaim_prune(struct pctrie_node **pnode, - struct pctrie_node *parent) +static __always_inline struct pctrie_node * +pctrie_reclaim_prune(struct pctrie_node **pnode, struct pctrie_node *parent, + pctrie_cb_t callback, int keyoff, void *arg) { struct pctrie_node *child, *node; int slot; @@ -812,8 +819,11 @@ pctrie_reclaim_prune(struct pctrie_node **pnode, PCTRIE_UNSERIALIZED); pctrie_node_store(&node->pn_child[slot], PCTRIE_NULL, PCTRIE_UNSERIALIZED); - if (pctrie_isleaf(child)) + if (pctrie_isleaf(child)) { + if (callback != NULL) + callback(pctrie_toptr(child, keyoff), arg); continue; + } /* Climb one level down the trie. */ pctrie_node_store(&node->pn_child[0], parent, PCTRIE_UNSERIALIZED); @@ -827,8 +837,9 @@ pctrie_reclaim_prune(struct pctrie_node **pnode, /* * Recover the node parent from its first child and continue pruning. */ -struct pctrie_node * -pctrie_reclaim_resume(struct pctrie_node **pnode) +static __always_inline struct pctrie_node * +pctrie_reclaim_resume_compound(struct pctrie_node **pnode, + pctrie_cb_t callback, int keyoff, void *arg) { struct pctrie_node *parent, *node; @@ -839,24 +850,55 @@ pctrie_reclaim_resume(struct pctrie_node **pnode) parent = pctrie_node_load(&node->pn_child[0], NULL, PCTRIE_UNSERIALIZED); pctrie_node_store(&node->pn_child[0], PCTRIE_NULL, PCTRIE_UNSERIALIZED); - return (pctrie_reclaim_prune(pnode, parent)); + return (pctrie_reclaim_prune(pnode, parent, callback, keyoff, arg)); } /* * Find the trie root, and start pruning with a NULL parent. 
*/ -struct pctrie_node * -pctrie_reclaim_begin(struct pctrie_node **pnode, - struct pctrie *ptree) +static __always_inline struct pctrie_node * +pctrie_reclaim_begin_compound(struct pctrie_node **pnode, + struct pctrie *ptree, + pctrie_cb_t callback, int keyoff, void *arg) { struct pctrie_node *node; node = pctrie_root_load(ptree, NULL, PCTRIE_UNSERIALIZED); pctrie_root_store(ptree, PCTRIE_NULL, PCTRIE_UNSERIALIZED); - if (pctrie_isleaf(node)) + if (pctrie_isleaf(node)) { + if (callback != NULL && node != PCTRIE_NULL) + callback(pctrie_toptr(node, keyoff), arg); return (NULL); + } *pnode = node; - return (pctrie_reclaim_prune(pnode, NULL)); + return (pctrie_reclaim_prune(pnode, NULL, callback, keyoff, arg)); +} + +struct pctrie_node * +pctrie_reclaim_resume(struct pctrie_node **pnode) +{ + return (pctrie_reclaim_resume_compound(pnode, NULL, 0, NULL)); +} + +struct pctrie_node * +pctrie_reclaim_begin(struct pctrie_node **pnode, struct pctrie *ptree) +{ + return (pctrie_reclaim_begin_compound(pnode, ptree, NULL, 0, NULL)); +} + +struct pctrie_node * +pctrie_reclaim_resume_cb(struct pctrie_node **pnode, + pctrie_cb_t callback, int keyoff, void *arg) +{ + return (pctrie_reclaim_resume_compound(pnode, callback, keyoff, arg)); +} + +struct pctrie_node * +pctrie_reclaim_begin_cb(struct pctrie_node **pnode, struct pctrie *ptree, + pctrie_cb_t callback, int keyoff, void *arg) +{ + return (pctrie_reclaim_begin_compound(pnode, ptree, + callback, keyoff, arg)); } /* diff --git a/sys/sys/pctrie.h b/sys/sys/pctrie.h index 06b9fca79528..4e1d8c7f8617 100644 --- a/sys/sys/pctrie.h +++ b/sys/sys/pctrie.h @@ -36,6 +36,8 @@ #ifdef _KERNEL +typedef void (*pctrie_cb_t)(void *ptr, void *arg); + #define PCTRIE_DEFINE_SMR(name, type, field, allocfn, freefn, smr) \ PCTRIE_DEFINE(name, type, field, allocfn, freefn) \ \ @@ -218,6 +220,24 @@ name##_PCTRIE_RECLAIM(struct pctrie *ptree) \ freefn(ptree, freenode); \ } \ \ +/* \ + * While reclaiming all internal trie nodes, invoke callback(leaf, 
arg) \ + * on every leaf in the trie, in order. \ + */ \ +static __inline __unused void \ +name##_PCTRIE_RECLAIM_CALLBACK(struct pctrie *ptree, \ + pctrie_cb_t callback, void *arg) \ +{ \ + struct pctrie_node *freenode, *node; \ + \ + for (freenode = pctrie_reclaim_begin_cb(&node, ptree, \ + callback, __offsetof(struct type, field), arg); \ + freenode != NULL; \ + freenode = pctrie_reclaim_resume_cb(&node, \ + callback, __offsetof(struct type, field), arg)) \ + freefn(ptree, freenode); \ +} \ + \ static __inline __unused struct type * \ name##_PCTRIE_REPLACE(struct pctrie *ptree, struct type *ptr) \ { \ @@ -269,6 +289,11 @@ uint64_t *pctrie_lookup_unlocked(struct pctrie *ptree, uint64_t key, struct pctrie_node *pctrie_reclaim_begin(struct pctrie_node **pnode, struct pctrie *ptree); struct pctrie_node *pctrie_reclaim_resume(struct pctrie_node **pnode); +struct pctrie_node *pctrie_reclaim_begin_cb(struct pctrie_node **pnode, + struct pctrie *ptree, + pctrie_cb_t callback, int keyoff, void *arg); +struct pctrie_node *pctrie_reclaim_resume_cb(struct pctrie_node **pnode, + pctrie_cb_t callback, int keyoff, void *arg); uint64_t *pctrie_remove_lookup(struct pctrie *ptree, uint64_t index, struct pctrie_node **killnode); uint64_t *pctrie_replace(struct pctrie *ptree, uint64_t *newval); From d2acf0a447093dcf0805c75490e1cd989bc8c36c Mon Sep 17 00:00:00 2001 From: Doug Moore Date: Thu, 13 Jun 2024 13:47:54 -0500 Subject: [PATCH 57/91] swap_pager: pctrie_reclaim_cb in meta_free_all Replace the lookup-remove loop in swp_pager_meta_free_all with a call to SWAP_PCTRIE_RECLAIM_CALLBACK, to eliminate repeated trie searches. 
Reviewed by: rlibby Differential Revision: https://reviews.freebsd.org/D45583 --- sys/vm/swap_pager.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c index 3bfda3eea169..c339f70ddea1 100644 --- a/sys/vm/swap_pager.c +++ b/sys/vm/swap_pager.c @@ -2220,8 +2220,11 @@ swp_pager_meta_free(vm_object_t object, vm_pindex_t pindex, vm_pindex_t count, } static void -swp_pager_meta_free_block(struct swblk *sb, struct page_range *range) +swp_pager_meta_free_block(void *sbv, void *rangev) { + struct swblk *sb = sbv; + struct page_range *range = rangev; + for (int i = 0; i < SWAP_META_PAGES; i++) { if (sb->d[i] != SWAPBLK_NONE) swp_pager_update_freerange(range, sb->d[i]); @@ -2238,19 +2241,13 @@ swp_pager_meta_free_block(struct swblk *sb, struct page_range *range) static void swp_pager_meta_free_all(vm_object_t object) { - struct swblk *sb; struct page_range range; - vm_pindex_t pindex; VM_OBJECT_ASSERT_WLOCKED(object); swp_pager_init_freerange(&range); - for (pindex = 0; (sb = SWAP_PCTRIE_LOOKUP_GE( - &object->un_pager.swp.swp_blks, pindex)) != NULL;) { - pindex = sb->p + SWAP_META_PAGES; - SWAP_PCTRIE_REMOVE(&object->un_pager.swp.swp_blks, sb->p); - swp_pager_meta_free_block(sb, &range); - } + SWAP_PCTRIE_RECLAIM_CALLBACK(&object->un_pager.swp.swp_blks, + swp_pager_meta_free_block, &range); swp_pager_freeswapspace(&range); } From a7f67ebd8275165b2f6099de20790e112f8b1433 Mon Sep 17 00:00:00 2001 From: Doug Moore Date: Thu, 13 Jun 2024 13:49:43 -0500 Subject: [PATCH 58/91] subr_rangeset: use pctrie_reclaim_cb in remove_all Replace the lookup-remove loop in rangeset_remove_all with a call to RANGESET_PCTRIE_RECLAIM_CALLBACK, to eliminate repeated trie searches. 
Reviewed by: rlibby Differential Revision: https://reviews.freebsd.org/D45584 --- sys/kern/subr_rangeset.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/sys/kern/subr_rangeset.c b/sys/kern/subr_rangeset.c index 0a675b4a2fce..94b77a449d22 100644 --- a/sys/kern/subr_rangeset.c +++ b/sys/kern/subr_rangeset.c @@ -232,18 +232,20 @@ rangeset_remove(struct rangeset *rs, uint64_t start, uint64_t end) return (rangeset_remove_pred(rs, start, end, rangeset_true_pred)); } +static void +rangeset_remove_leaf(void *rv, void *rsv) +{ + struct rs_el *r = rv; + struct rangeset *rs = rsv; + + rs->rs_free_data(rs->rs_data_ctx, r); +} + void rangeset_remove_all(struct rangeset *rs) { - struct rs_el *r; - - for (;;) { - r = RANGESET_PCTRIE_LOOKUP_GE(&rs->rs_trie, 0); - if (r == NULL) - break; - RANGESET_PCTRIE_REMOVE(&rs->rs_trie, r->re_start); - rs->rs_free_data(rs->rs_data_ctx, r); - } + RANGESET_PCTRIE_RECLAIM_CALLBACK(&rs->rs_trie, + rangeset_remove_leaf, rs); } void * From 268f19aacc6af8f64c438e8515213023a2e66ed7 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Sun, 9 Jun 2024 11:58:27 -0500 Subject: [PATCH 59/91] vm: Reduce address space fragmentation jemalloc performs two types of virtual memory allocations: (1) large chunks of virtual memory, where the chunk size is a multiple of a superpage and explicitly aligned, and (2) small allocations, mostly 128KB, where no alignment is requested. Typically, it starts with a small allocation, and over time it makes both types of allocation. With anon_loc being updated on every allocation, we wind up with a repeating pattern of a small allocation, a large gap, and a large, aligned allocation. (As an aside, we wind up allocating a reservation for these small allocations, but it will never fill because the next large, aligned allocation updates anon_loc, leaving a gap that will never be filled with other small allocations.) With this change, anon_loc isn't updated on every allocation. 
So, the small allocations will be clustered together, the large allocations will be clustered together, and there will be fewer gaps between the anonymous memory allocations. In addition, I see a small reduction in reservations allocated (e.g., 1.6% during buildworld), fewer partially populated reservations, and a small increase in 64KB page promotions on arm64. Reviewed by: kib MFC after: 1 week Differential Revision: https://reviews.freebsd.org/D39845 --- sys/vm/vm_map.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/sys/vm/vm_map.c b/sys/vm/vm_map.c index 3c7afcb6642f..fa71bb8a01d6 100644 --- a/sys/vm/vm_map.c +++ b/sys/vm/vm_map.c @@ -2247,8 +2247,15 @@ vm_map_find(vm_map_t map, vm_object_t object, vm_ooffset_t offset, rv = vm_map_insert(map, object, offset, *addr, *addr + length, prot, max, cow); } - if (rv == KERN_SUCCESS && update_anon) - map->anon_loc = *addr + length; + + /* + * Update the starting address for clustered anonymous memory mappings + * if a starting address was not previously defined or an ASLR restart + * placed an anonymous memory mapping at a lower address. + */ + if (update_anon && rv == KERN_SUCCESS && (map->anon_loc == 0 || + *addr < map->anon_loc)) + map->anon_loc = *addr; done: vm_map_unlock(map); return (rv); @@ -4041,9 +4048,6 @@ vm_map_delete(vm_map_t map, vm_offset_t start, vm_offset_t end) entry->object.vm_object != NULL) pmap_map_delete(map->pmap, entry->start, entry->end); - if (entry->end == map->anon_loc) - map->anon_loc = entry->start; - /* * Delete the entry only after removing all pmap * entries pointing to its pages. (Otherwise, its From d19851f002862a5510bf31fae4083fab979258be Mon Sep 17 00:00:00 2001 From: Doug Moore Date: Thu, 13 Jun 2024 15:28:15 -0500 Subject: [PATCH 60/91] subr_pctrie: add a word to a comment No functional changes. 
Reported by: alc --- sys/kern/subr_pctrie.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sys/kern/subr_pctrie.c b/sys/kern/subr_pctrie.c index 347c2bffd503..6949e3de99bf 100644 --- a/sys/kern/subr_pctrie.c +++ b/sys/kern/subr_pctrie.c @@ -802,7 +802,7 @@ pctrie_remove_lookup(struct pctrie *ptree, uint64_t index, * Walk the subtrie rooted at *pnode in order, invoking callback on leaves and * using the leftmost child pointer for path reversal, until an interior node * is stripped of all children, and returned for deallocation, with *pnode left - * pointing the parent of that node. + * pointing to the parent of that node. */ static __always_inline struct pctrie_node * pctrie_reclaim_prune(struct pctrie_node **pnode, struct pctrie_node *parent, From d9220f64da8b1d55b15259e8b266b50371ed4f47 Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Thu, 13 Jun 2024 20:00:47 -0400 Subject: [PATCH 61/91] uniq tests: Make uniq_test:stdout more reliable When running regression tests in parallel, this one occasionally fails because uniq exits with status 0. I believe this is because the test is a bit racy: it assumes that true(1) will exit before uniq writes to standard out. Just sleep for a bit to give the other end of the pipe time to exit. Reviewed by: des MFC after: 1 week Differential Revision: https://reviews.freebsd.org/D45534 --- usr.bin/uniq/tests/uniq_test.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/usr.bin/uniq/tests/uniq_test.sh b/usr.bin/uniq/tests/uniq_test.sh index 804e82ce7766..9d37245dee4e 100755 --- a/usr.bin/uniq/tests/uniq_test.sh +++ b/usr.bin/uniq/tests/uniq_test.sh @@ -167,6 +167,8 @@ stdout_head() { stdout_body() { ( trap "" PIPE + # Give true(1) some time to exit. + sleep 1 echo a | uniq 2>stderr echo $? 
>result ) | true From ab250b02ba4ca29572a29dedc9e6276cba5a66db Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Thu, 13 Jun 2024 20:02:32 -0400 Subject: [PATCH 62/91] bnxt: Use a simpler test for 32-bit platforms Suggested by: jrtc27 Fixes: c867ba72889d ("bnxt: Do not compile on 32-bit platforms") --- sys/modules/Makefile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sys/modules/Makefile b/sys/modules/Makefile index d3ef4c63a41c..7ce956957fd5 100644 --- a/sys/modules/Makefile +++ b/sys/modules/Makefile @@ -442,8 +442,7 @@ SUBDIR+= dtrace SUBDIR+= opensolaris .endif -.if ${MACHINE_ARCH} != "i386" && ${MACHINE_CPUARCH} != "arm" && \ - ${MACHINE_ARCH} != "powerpc" && ${MACHINE_ARCH} != "powerpcspe" +.if !${MACHINE_ABI:Mlong32} _bnxt= bnxt .endif From 69ccea1c89da82cbb772ac4cca62c1067346fe95 Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Thu, 13 Jun 2024 20:11:03 -0400 Subject: [PATCH 63/91] vm_page: Let vm_page_init_page() take a pool parameter This is useful for a subsequent patch which implements lazy initialization of vm_page structures using a dedicated vm_phys free page pool. No functional change intended. 
Reviewed by: alc, kib, emaste MFC after: 2 weeks Differential Revision: https://reviews.freebsd.org/D40399 --- sys/amd64/amd64/efirt_machdep.c | 3 ++- sys/vm/vm_page.c | 10 +++++----- sys/vm/vm_page.h | 2 +- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/sys/amd64/amd64/efirt_machdep.c b/sys/amd64/amd64/efirt_machdep.c index 5d0106e2aa88..2c00a16b1499 100644 --- a/sys/amd64/amd64/efirt_machdep.c +++ b/sys/amd64/amd64/efirt_machdep.c @@ -245,7 +245,8 @@ efi_create_1t1_map(struct efi_md *map, int ndesc, int descsz) m = PHYS_TO_VM_PAGE(va); if (m != NULL && VM_PAGE_TO_PHYS(m) == 0) { - vm_page_init_page(m, va, -1); + vm_page_init_page(m, va, -1, + VM_FREEPOOL_DEFAULT); m->order = VM_NFREEORDER + 1; /* invalid */ m->pool = VM_NFREEPOOL + 1; /* invalid */ pmap_page_set_memattr_noflush(m, mode); diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c index 4b97637668b4..3077b023754a 100644 --- a/sys/vm/vm_page.c +++ b/sys/vm/vm_page.c @@ -497,9 +497,8 @@ vm_page_domain_init(int domain) * lists. 
*/ void -vm_page_init_page(vm_page_t m, vm_paddr_t pa, int segind) +vm_page_init_page(vm_page_t m, vm_paddr_t pa, int segind, int pool) { - m->object = NULL; m->ref_count = 0; m->busy_lock = VPB_FREED; @@ -509,7 +508,7 @@ vm_page_init_page(vm_page_t m, vm_paddr_t pa, int segind) m->psind = 0; m->segind = segind; m->order = VM_NFREEORDER; - m->pool = VM_FREEPOOL_DEFAULT; + m->pool = pool; m->valid = m->dirty = 0; pmap_page_init(m); } @@ -756,7 +755,8 @@ vm_page_startup(vm_offset_t vaddr) #if defined(__i386__) && defined(VM_PHYSSEG_DENSE) for (ii = 0; ii < vm_page_array_size; ii++) { m = &vm_page_array[ii]; - vm_page_init_page(m, (first_page + ii) << PAGE_SHIFT, 0); + vm_page_init_page(m, (first_page + ii) << PAGE_SHIFT, 0, + VM_FREEPOOL_DEFAULT); m->flags = PG_FICTITIOUS; } #endif @@ -765,7 +765,7 @@ vm_page_startup(vm_offset_t vaddr) seg = &vm_phys_segs[segind]; for (m = seg->first_page, pa = seg->start; pa < seg->end; m++, pa += PAGE_SIZE) - vm_page_init_page(m, pa, segind); + vm_page_init_page(m, pa, segind, VM_FREEPOOL_DEFAULT); /* * Add the segment's pages that are covered by one of diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h index 221f58b8fb04..5422f8df89a0 100644 --- a/sys/vm/vm_page.h +++ b/sys/vm/vm_page.h @@ -645,7 +645,7 @@ void vm_page_free_invalid(vm_page_t); vm_page_t vm_page_getfake(vm_paddr_t paddr, vm_memattr_t memattr); void vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr); void vm_page_init_marker(vm_page_t marker, int queue, uint16_t aflags); -void vm_page_init_page(vm_page_t m, vm_paddr_t pa, int segind); +void vm_page_init_page(vm_page_t m, vm_paddr_t pa, int segind, int pool); int vm_page_insert (vm_page_t, vm_object_t, vm_pindex_t); void vm_page_invalid(vm_page_t m); void vm_page_launder(vm_page_t m); From d7ec4a8859b44beb9ef69620134ba5c5ce4d598c Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Thu, 13 Jun 2024 20:11:15 -0400 Subject: [PATCH 64/91] vm_phys: Factor out some calls to vm_freelist_add() A subsequent 
patch will make this factoring more worthwhile. No functional change intended. Reviewed by: dougm, alc, kib, emaste MFC after: 2 weeks Differential Revision: https://reviews.freebsd.org/D40400 --- sys/vm/vm_phys.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/sys/vm/vm_phys.c b/sys/vm/vm_phys.c index 62e84a5b52bd..6323fb484030 100644 --- a/sys/vm/vm_phys.c +++ b/sys/vm/vm_phys.c @@ -680,6 +680,15 @@ vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl, int order, } } +static void +vm_phys_enq_chunk(struct vm_freelist *fl, vm_page_t m, int order, int tail) +{ + KASSERT(order >= 0 && order < VM_NFREEORDER, + ("%s: invalid order %d", __func__, order)); + + vm_freelist_add(fl, m, order, tail); +} + /* * Add the physical pages [m, m + npages) at the beginning of a power-of-two * aligned and sized set to the specified free list. @@ -706,13 +715,13 @@ vm_phys_enq_beg(vm_page_t m, u_int npages, struct vm_freelist *fl, int tail) KASSERT(m->order == VM_NFREEORDER, ("%s: page %p has unexpected order %d", __func__, m, m->order)); - order = ilog2(npages); + order = ilog2(npages); KASSERT(order < VM_NFREEORDER, ("%s: order %d is out of range", __func__, order)); - vm_freelist_add(fl, m, order, tail); + vm_phys_enq_chunk(fl, m, order, tail); m += 1 << order; - npages -= 1 << order; - } + npages -= 1 << order; + } } /* @@ -743,9 +752,7 @@ vm_phys_enq_range(vm_page_t m, u_int npages, struct vm_freelist *fl, int tail) ("vm_phys_enq_range: page %p has unexpected order %d", m, m->order)); order = ffs(npages) - 1; - KASSERT(order < VM_NFREEORDER, - ("vm_phys_enq_range: order %d is out of range", order)); - vm_freelist_add(fl, m, order, tail); + vm_phys_enq_chunk(fl, m, order, tail); m += 1 << order; npages -= 1 << order; } @@ -1203,7 +1210,7 @@ vm_phys_enqueue_contig(vm_page_t m, u_long npages) KASSERT(seg == &vm_phys_segs[m->segind], ("%s: page range [%p,%p) spans multiple segments", __func__, m_end - npages, m)); - 
vm_freelist_add(fl, m, order, 1); + vm_phys_enq_chunk(fl, m, order, 1); m += 1 << order; } /* Free blocks of diminishing size. */ From 69cbb18746b69cbcdf79f1728d0435a1c86fff58 Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Thu, 13 Jun 2024 20:11:36 -0400 Subject: [PATCH 65/91] vm_phys: Add a vm_phys_seg_paddr_to_vm_page() helper No functional change intended. Suggested by: alc Reviewed by: dougm, alc, kib MFC after: 2 weeks Differential Revision: https://reviews.freebsd.org/D43636 --- sys/vm/vm_page.c | 2 +- sys/vm/vm_phys.c | 29 +++++++++++++++++++++-------- sys/vm/vm_phys.h | 1 + 3 files changed, 23 insertions(+), 9 deletions(-) diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c index 3077b023754a..37149cb660cb 100644 --- a/sys/vm/vm_page.c +++ b/sys/vm/vm_page.c @@ -782,7 +782,7 @@ vm_page_startup(vm_offset_t vaddr) if (pagecount == 0) continue; - m = seg->first_page + atop(startp - seg->start); + m = vm_phys_seg_paddr_to_vm_page(seg, startp); vmd = VM_DOMAIN(seg->domain); vm_domain_free_lock(vmd); vm_phys_enqueue_contig(m, pagecount); diff --git a/sys/vm/vm_phys.c b/sys/vm/vm_phys.c index 6323fb484030..35e544708c84 100644 --- a/sys/vm/vm_phys.c +++ b/sys/vm/vm_phys.c @@ -932,6 +932,19 @@ vm_phys_alloc_freelist_pages(int domain, int freelist, int pool, int order) return (NULL); } +/* + * Find the vm_page corresponding to the given physical address, which must lie + * within the given physical memory segment. + */ +vm_page_t +vm_phys_seg_paddr_to_vm_page(struct vm_phys_seg *seg, vm_paddr_t pa) +{ + KASSERT(pa >= seg->start && pa < seg->end, + ("%s: pa %#jx is out of range", __func__, (uintmax_t)pa)); + + return (&seg->first_page[atop(pa - seg->start)]); +} + /* * Find the vm_page corresponding to the given physical address. 
*/ @@ -941,7 +954,7 @@ vm_phys_paddr_to_vm_page(vm_paddr_t pa) struct vm_phys_seg *seg; if ((seg = vm_phys_paddr_to_seg(pa)) != NULL) - return (&seg->first_page[atop(pa - seg->start)]); + return (vm_phys_seg_paddr_to_vm_page(seg, pa)); return (NULL); } @@ -1157,7 +1170,7 @@ vm_phys_free_pages(vm_page_t m, int order) pa ^= ((vm_paddr_t)1 << (PAGE_SHIFT + order)); if (pa < seg->start || pa >= seg->end) break; - m_buddy = &seg->first_page[atop(pa - seg->start)]; + m_buddy = vm_phys_seg_paddr_to_vm_page(seg, pa); if (m_buddy->order != order) break; fl = (*seg->free_queues)[m_buddy->pool]; @@ -1166,7 +1179,7 @@ vm_phys_free_pages(vm_page_t m, int order) vm_phys_set_pool(m->pool, m_buddy, order); order++; pa &= ~(((vm_paddr_t)1 << (PAGE_SHIFT + order)) - 1); - m = &seg->first_page[atop(pa - seg->start)]; + m = vm_phys_seg_paddr_to_vm_page(seg, pa); } while (order < VM_NFREEORDER - 1); } fl = (*seg->free_queues)[m->pool]; @@ -1278,8 +1291,8 @@ vm_phys_find_range(vm_page_t bounds[], int segind, int domain, pa_end = MIN(high, seg->end); if (pa_end - pa_start < ptoa(npages)) continue; - bounds[0] = &seg->first_page[atop(pa_start - seg->start)]; - bounds[1] = &seg->first_page[atop(pa_end - seg->start)]; + bounds[0] = vm_phys_seg_paddr_to_vm_page(seg, pa_start); + bounds[1] = vm_phys_seg_paddr_to_vm_page(seg, pa_end); return (seg - vm_phys_segs); } return (-1); @@ -1313,7 +1326,7 @@ vm_phys_unfree_page(vm_page_t m) order++; pa = m->phys_addr & (~(vm_paddr_t)0 << (PAGE_SHIFT + order)); if (pa >= seg->start) - m_set = &seg->first_page[atop(pa - seg->start)]; + m_set = vm_phys_seg_paddr_to_vm_page(seg, pa); else return (false); } @@ -1338,10 +1351,10 @@ vm_phys_unfree_page(vm_page_t m) order--; pa_half = m_set->phys_addr ^ (1 << (PAGE_SHIFT + order)); if (m->phys_addr < pa_half) - m_tmp = &seg->first_page[atop(pa_half - seg->start)]; + m_tmp = vm_phys_seg_paddr_to_vm_page(seg, pa_half); else { m_tmp = m_set; - m_set = &seg->first_page[atop(pa_half - seg->start)]; + m_set = 
vm_phys_seg_paddr_to_vm_page(seg, pa_half); } vm_freelist_add(fl, m_tmp, order, 0); } diff --git a/sys/vm/vm_phys.h b/sys/vm/vm_phys.h index 1f502ad745c4..300443accdb6 100644 --- a/sys/vm/vm_phys.h +++ b/sys/vm/vm_phys.h @@ -77,6 +77,7 @@ void vm_phys_free_contig(vm_page_t m, u_long npages); void vm_phys_free_pages(vm_page_t m, int order); void vm_phys_init(void); vm_page_t vm_phys_paddr_to_vm_page(vm_paddr_t pa); +vm_page_t vm_phys_seg_paddr_to_vm_page(struct vm_phys_seg *seg, vm_paddr_t pa); void vm_phys_register_domains(int ndomains, struct mem_affinity *affinity, int *locality); bool vm_phys_unfree_page(vm_page_t m); From b16b4c22d2d19bfb3d8e0fcce9e83b9d69f915e8 Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Thu, 13 Jun 2024 20:11:47 -0400 Subject: [PATCH 66/91] vm_page: Implement lazy page initialization FreeBSD's boot times have decreased to the point where vm_page array initialization represents a significant fraction of the total boot time. For example, when booting FreeBSD in Firecracker (a VMM designed to support lightweight VMs) with 128MB and 1GB of RAM, vm_page initialization consumes 9% (3ms) and 37% (21.5ms) of the kernel boot time, respectively. This is generally relevant in cloud environments, where one wants to be able to spin up VMs as quickly as possible. This patch implements lazy initialization of (most) page structures, following a suggestion from cperciva@. The idea is to introduce a new free pool, VM_FREEPOOL_LAZYINIT, into which all vm_page structures are initially placed. For this to work, we need only initialize the first free page of each chunk placed into the buddy allocator. Then, early page allocations draw from the lazy init pool and initialize vm_page chunks (up to 16MB, 4096 pages) on demand. Once APs are started, an idle-priority thread drains the lazy init pool in the background to avoid introducing extra latency in the allocator. With this scheme, almost all of the initialization work is moved out of the critical path. 
A couple of vm_phys operations require the pool to be drained before they can run: vm_phys_find_range() and vm_phys_unfree_page(). However, these are rare operations. I believe that vm_phys_find_freelist_contig() does not require any special treatment, as it only ever accesses the first page in a power-of-2-sized free page chunk, which is always initialized. For now the new pool is only used on amd64 and arm64, since that's where I can easily test and those platforms would get the most benefit. Reviewed by: alc, kib Differential Revision: https://reviews.freebsd.org/D40403 --- sys/amd64/include/vmparam.h | 16 +-- sys/arm64/include/vmparam.h | 16 +-- sys/vm/vm_page.c | 65 +++++++++++- sys/vm/vm_phys.c | 201 +++++++++++++++++++++++++++++++++--- sys/vm/vm_phys.h | 2 +- 5 files changed, 263 insertions(+), 37 deletions(-) diff --git a/sys/amd64/include/vmparam.h b/sys/amd64/include/vmparam.h index 93c2648e8fac..2ffa51d9021c 100644 --- a/sys/amd64/include/vmparam.h +++ b/sys/amd64/include/vmparam.h @@ -94,14 +94,16 @@ #define VM_PHYSSEG_MAX 63 /* - * Create two free page pools: VM_FREEPOOL_DEFAULT is the default pool - * from which physical pages are allocated and VM_FREEPOOL_DIRECT is - * the pool from which physical pages for page tables and small UMA - * objects are allocated. + * Create three free page pools: VM_FREEPOOL_DEFAULT is the default pool from + * which physical pages are allocated and VM_FREEPOOL_DIRECT is the pool from + * which physical pages for page tables and small UMA objects are allocated. + * VM_FREEPOOL_LAZYINIT is a special-purpose pool that is populated only during + * boot and is used to implement deferred initialization of page structures. 
*/ -#define VM_NFREEPOOL 2 -#define VM_FREEPOOL_DEFAULT 0 -#define VM_FREEPOOL_DIRECT 1 +#define VM_NFREEPOOL 3 +#define VM_FREEPOOL_LAZYINIT 0 +#define VM_FREEPOOL_DEFAULT 1 +#define VM_FREEPOOL_DIRECT 2 /* * Create up to three free page lists: VM_FREELIST_DMA32 is for physical pages diff --git a/sys/arm64/include/vmparam.h b/sys/arm64/include/vmparam.h index 0e93e4026d4a..08a3b18da64e 100644 --- a/sys/arm64/include/vmparam.h +++ b/sys/arm64/include/vmparam.h @@ -73,14 +73,16 @@ #define VM_PHYSSEG_MAX 64 /* - * Create two free page pools: VM_FREEPOOL_DEFAULT is the default pool - * from which physical pages are allocated and VM_FREEPOOL_DIRECT is - * the pool from which physical pages for small UMA objects are - * allocated. + * Create three free page pools: VM_FREEPOOL_DEFAULT is the default pool from + * which physical pages are allocated and VM_FREEPOOL_DIRECT is the pool from + * which physical pages for page tables and small UMA objects are allocated. + * VM_FREEPOOL_LAZYINIT is a special-purpose pool that is populated only during + * boot and is used to implement deferred initialization of page structures. 
*/ -#define VM_NFREEPOOL 2 -#define VM_FREEPOOL_DEFAULT 0 -#define VM_FREEPOOL_DIRECT 1 +#define VM_NFREEPOOL 3 +#define VM_FREEPOOL_LAZYINIT 0 +#define VM_FREEPOOL_DEFAULT 1 +#define VM_FREEPOOL_DIRECT 2 /* * Create two free page lists: VM_FREELIST_DMA32 is for physical pages that have diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c index 37149cb660cb..8e94c02089ae 100644 --- a/sys/vm/vm_page.c +++ b/sys/vm/vm_page.c @@ -333,9 +333,9 @@ vm_page_blacklist_add(vm_paddr_t pa, bool verbose) if (m == NULL) return (true); /* page does not exist, no failure */ - vmd = vm_pagequeue_domain(m); + vmd = VM_DOMAIN(vm_phys_domain(pa)); vm_domain_free_lock(vmd); - found = vm_phys_unfree_page(m); + found = vm_phys_unfree_page(pa); vm_domain_free_unlock(vmd); if (found) { vm_domain_freecnt_inc(vmd, -1); @@ -568,6 +568,9 @@ vm_page_startup(vm_offset_t vaddr) #if defined(__i386__) && defined(VM_PHYSSEG_DENSE) long ii; #endif +#ifdef VM_FREEPOOL_LAZYINIT + int lazyinit; +#endif vaddr = round_page(vaddr); @@ -748,6 +751,11 @@ vm_page_startup(vm_offset_t vaddr) */ vm_phys_init(); +#ifdef VM_FREEPOOL_LAZYINIT + lazyinit = 1; + TUNABLE_INT_FETCH("debug.vm.lazy_page_init", &lazyinit); +#endif + /* * Initialize the page structures and add every available page to the * physical memory allocator's free lists. @@ -763,9 +771,50 @@ vm_page_startup(vm_offset_t vaddr) vm_cnt.v_page_count = 0; for (segind = 0; segind < vm_phys_nsegs; segind++) { seg = &vm_phys_segs[segind]; - for (m = seg->first_page, pa = seg->start; pa < seg->end; - m++, pa += PAGE_SIZE) - vm_page_init_page(m, pa, segind, VM_FREEPOOL_DEFAULT); + + /* + * If lazy vm_page initialization is not enabled, simply + * initialize all of the pages in the segment. Otherwise, we + * only initialize: + * 1. Pages not covered by phys_avail[], since they might be + * freed to the allocator at some future point, e.g., by + * kmem_bootstrap_free(). + * 2. 
The first page of each run of free pages handed to the + * vm_phys allocator, which in turn defers initialization + * of pages until they are needed. + * This avoids blocking the boot process for long periods, which + * may be relevant for VMs (which ought to boot as quickly as + * possible) and/or systems with large amounts of physical + * memory. + */ +#ifdef VM_FREEPOOL_LAZYINIT + if (lazyinit) { + startp = seg->start; + for (i = 0; phys_avail[i + 1] != 0; i += 2) { + if (startp >= seg->end) + break; + + if (phys_avail[i + 1] < startp) + continue; + if (phys_avail[i] <= startp) { + startp = phys_avail[i + 1]; + continue; + } + + m = vm_phys_seg_paddr_to_vm_page(seg, startp); + for (endp = MIN(phys_avail[i], seg->end); + startp < endp; startp += PAGE_SIZE, m++) { + vm_page_init_page(m, startp, segind, + VM_FREEPOOL_DEFAULT); + } + } + } else +#endif + for (m = seg->first_page, pa = seg->start; + pa < seg->end; m++, pa += PAGE_SIZE) { + vm_page_init_page(m, pa, segind, + VM_FREEPOOL_DEFAULT); + } /* * Add the segment's pages that are covered by one of @@ -783,6 +832,12 @@ vm_page_startup(vm_offset_t vaddr) continue; m = vm_phys_seg_paddr_to_vm_page(seg, startp); +#ifdef VM_FREEPOOL_LAZYINIT + if (lazyinit) { + vm_page_init_page(m, startp, segind, + VM_FREEPOOL_LAZYINIT); + } +#endif vmd = VM_DOMAIN(seg->domain); vm_domain_free_lock(vmd); vm_phys_enqueue_contig(m, pagecount); diff --git a/sys/vm/vm_phys.c b/sys/vm/vm_phys.c index 35e544708c84..a28b3a40691e 100644 --- a/sys/vm/vm_phys.c +++ b/sys/vm/vm_phys.c @@ -47,14 +47,18 @@ #include #include #include +#include #include #include #include #include #include #include +#include #include #include +#include +#include #include #include @@ -141,6 +145,7 @@ vm_paddr_t dump_avail[PHYS_AVAIL_COUNT]; * Provides the mapping from VM_FREELIST_* to free list indices (flind). 
*/ static int __read_mostly vm_freelist_to_flind[VM_NFREELIST]; +static int __read_mostly vm_default_freepool; CTASSERT(VM_FREELIST_DEFAULT == 0); @@ -184,6 +189,16 @@ static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end); static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl, int order, int tail); +static bool __diagused +vm_phys_pool_valid(int pool) +{ +#ifdef VM_FREEPOOL_LAZYINIT + if (pool == VM_FREEPOOL_LAZYINIT) + return (false); +#endif + return (pool >= 0 && pool < VM_NFREEPOOL); +} + /* * Red-black tree helpers for vm fictitious range management. */ @@ -621,6 +636,12 @@ vm_phys_init(void) } } +#ifdef VM_FREEPOOL_LAZYINIT + vm_default_freepool = VM_FREEPOOL_LAZYINIT; +#else + vm_default_freepool = VM_FREEPOOL_DEFAULT; +#endif + rw_init(&vm_phys_fictitious_reg_lock, "vmfctr"); } @@ -687,6 +708,17 @@ vm_phys_enq_chunk(struct vm_freelist *fl, vm_page_t m, int order, int tail) ("%s: invalid order %d", __func__, order)); vm_freelist_add(fl, m, order, tail); +#ifdef VM_FREEPOOL_LAZYINIT + if (__predict_false(m->pool == VM_FREEPOOL_LAZYINIT)) { + vm_page_t m_next; + int npages; + + npages = 1 << order; + m_next = m + npages; + vm_page_init_page(m_next, m->phys_addr + ptoa(npages), m->segind, + VM_FREEPOOL_LAZYINIT); + } +#endif } /* @@ -760,15 +792,33 @@ vm_phys_enq_range(vm_page_t m, u_int npages, struct vm_freelist *fl, int tail) } /* - * Set the pool for a contiguous, power of two-sized set of physical pages. + * Set the pool for a contiguous, power of two-sized set of physical pages. + * + * If the pages currently belong to the lazy init pool, then the corresponding + * page structures must be initialized. In this case it is assumed that the + * first page in the run has already been initialized. 
*/ static void vm_phys_set_pool(int pool, vm_page_t m, int order) { - vm_page_t m_tmp; - - for (m_tmp = m; m_tmp < &m[1 << order]; m_tmp++) - m_tmp->pool = pool; +#ifdef VM_FREEPOOL_LAZYINIT + if (__predict_false(m->pool == VM_FREEPOOL_LAZYINIT)) { + vm_paddr_t pa; + int segind; + + m->pool = pool; + + TSENTER(); + pa = m->phys_addr + PAGE_SIZE; + segind = m->segind; + for (vm_page_t m_tmp = m + 1; m_tmp < &m[1 << order]; + m_tmp++, pa += PAGE_SIZE) + vm_page_init_page(m_tmp, pa, segind, pool); + TSEXIT(); + } else +#endif + for (vm_page_t m_tmp = m; m_tmp < &m[1 << order]; m_tmp++) + m_tmp->pool = pool; } /* @@ -792,7 +842,7 @@ vm_phys_alloc_npages(int domain, int pool, int npages, vm_page_t ma[]) KASSERT(domain >= 0 && domain < vm_ndomains, ("vm_phys_alloc_npages: domain %d is out of range", domain)); - KASSERT(pool < VM_NFREEPOOL, + KASSERT(vm_phys_pool_valid(pool), ("vm_phys_alloc_npages: pool %d is out of range", pool)); KASSERT(npages <= 1 << (VM_NFREEORDER - 1), ("vm_phys_alloc_npages: npages %d is out of range", npages)); @@ -821,7 +871,8 @@ vm_phys_alloc_npages(int domain, int pool, int npages, vm_page_t ma[]) } } for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) { - for (pind = 0; pind < VM_NFREEPOOL; pind++) { + for (pind = vm_default_freepool; pind < VM_NFREEPOOL; + pind++) { alt = vm_phys_free_queues[domain][flind][pind]; while ((m = TAILQ_FIRST(&alt[oind].pl)) != NULL) { @@ -888,7 +939,7 @@ vm_phys_alloc_freelist_pages(int domain, int freelist, int pool, int order) KASSERT(freelist < VM_NFREELIST, ("vm_phys_alloc_freelist_pages: freelist %d is out of range", freelist)); - KASSERT(pool < VM_NFREEPOOL, + KASSERT(vm_phys_pool_valid(pool), ("vm_phys_alloc_freelist_pages: pool %d is out of range", pool)); KASSERT(order < VM_NFREEORDER, ("vm_phys_alloc_freelist_pages: order %d is out of range", order)); @@ -917,7 +968,7 @@ vm_phys_alloc_freelist_pages(int domain, int freelist, int pool, int order) * use them to satisfy the allocation. 
*/ for (oind = VM_NFREEORDER - 1; oind >= order; oind--) { - for (pind = 0; pind < VM_NFREEPOOL; pind++) { + for (pind = vm_default_freepool; pind < VM_NFREEPOOL; pind++) { alt = &vm_phys_free_queues[domain][flind][pind][0]; m = TAILQ_FIRST(&alt[oind].pl); if (m != NULL) { @@ -1157,7 +1208,7 @@ vm_phys_free_pages(vm_page_t m, int order) KASSERT(m->order == VM_NFREEORDER, ("vm_phys_free_pages: page %p has unexpected order %d", m, m->order)); - KASSERT(m->pool < VM_NFREEPOOL, + KASSERT(vm_phys_pool_valid(m->pool), ("vm_phys_free_pages: page %p has unexpected pool %d", m, m->pool)); KASSERT(order < VM_NFREEORDER, @@ -1186,6 +1237,107 @@ vm_phys_free_pages(vm_page_t m, int order) vm_freelist_add(fl, m, order, 1); } +#ifdef VM_FREEPOOL_LAZYINIT +/* + * Initialize all pages lingering in the lazy init pool of a NUMA domain, moving + * them to the default pool. This is a prerequisite for some rare operations + * which need to scan the page array and thus depend on all pages being + * initialized. + */ +static void +vm_phys_lazy_init_domain(int domain, bool locked) +{ + static bool initdone[MAXMEMDOM]; + struct vm_domain *vmd; + struct vm_freelist *fl; + vm_page_t m; + int pind; + bool unlocked; + + if (__predict_true(atomic_load_bool(&initdone[domain]))) + return; + + vmd = VM_DOMAIN(domain); + if (locked) + vm_domain_free_assert_locked(vmd); + else + vm_domain_free_lock(vmd); + if (atomic_load_bool(&initdone[domain])) + goto out; + pind = VM_FREEPOOL_LAZYINIT; + for (int freelist = 0; freelist < VM_NFREELIST; freelist++) { + int flind; + + flind = vm_freelist_to_flind[freelist]; + if (flind < 0) + continue; + fl = vm_phys_free_queues[domain][flind][pind]; + for (int oind = 0; oind < VM_NFREEORDER; oind++) { + if (atomic_load_int(&fl[oind].lcnt) == 0) + continue; + while ((m = TAILQ_FIRST(&fl[oind].pl)) != NULL) { + /* + * Avoid holding the lock across the + * initialization unless there's a free page + * shortage. 
+ */ + vm_freelist_rem(fl, m, oind); + unlocked = vm_domain_allocate(vmd, + VM_ALLOC_NORMAL, 1 << oind); + if (unlocked) + vm_domain_free_unlock(vmd); + vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m, oind); + if (unlocked) { + vm_domain_freecnt_inc(vmd, 1 << oind); + vm_domain_free_lock(vmd); + } + vm_phys_free_pages(m, oind); + } + } + } + atomic_store_bool(&initdone[domain], true); +out: + if (!locked) + vm_domain_free_unlock(vmd); +} + +static void +vm_phys_lazy_init(void) +{ + for (int domain = 0; domain < vm_ndomains; domain++) + vm_phys_lazy_init_domain(domain, false); + atomic_store_int(&vm_default_freepool, VM_FREEPOOL_DEFAULT); +} + +static void +vm_phys_lazy_init_kthr(void *arg __unused) +{ + vm_phys_lazy_init(); + kthread_exit(); +} + +static void +vm_phys_lazy_sysinit(void *arg __unused) +{ + struct thread *td; + int error; + + error = kthread_add(vm_phys_lazy_init_kthr, NULL, curproc, &td, + RFSTOPPED, 0, "vmlazyinit"); + if (error == 0) { + thread_lock(td); + sched_prio(td, PRI_MIN_IDLE); + sched_add(td, SRQ_BORING); + } else { + printf("%s: could not create lazy init thread: %d\n", + __func__, error); + vm_phys_lazy_init(); + } +} +SYSINIT(vm_phys_lazy_init, SI_SUB_SMP, SI_ORDER_ANY, vm_phys_lazy_sysinit, + NULL); +#endif /* VM_FREEPOOL_LAZYINIT */ + /* * Free a contiguous, arbitrarily sized set of physical pages, without * merging across set boundaries. @@ -1291,6 +1443,12 @@ vm_phys_find_range(vm_page_t bounds[], int segind, int domain, pa_end = MIN(high, seg->end); if (pa_end - pa_start < ptoa(npages)) continue; +#ifdef VM_FREEPOOL_LAZYINIT + /* + * The pages on the free lists must be initialized. + */ + vm_phys_lazy_init_domain(domain, false); +#endif bounds[0] = vm_phys_seg_paddr_to_vm_page(seg, pa_start); bounds[1] = vm_phys_seg_paddr_to_vm_page(seg, pa_end); return (seg - vm_phys_segs); @@ -1306,21 +1464,30 @@ vm_phys_find_range(vm_page_t bounds[], int segind, int domain, * The free page queues must be locked. 
*/ bool -vm_phys_unfree_page(vm_page_t m) +vm_phys_unfree_page(vm_paddr_t pa) { struct vm_freelist *fl; struct vm_phys_seg *seg; - vm_paddr_t pa, pa_half; - vm_page_t m_set, m_tmp; + vm_paddr_t pa_half; + vm_page_t m, m_set, m_tmp; int order; + seg = vm_phys_paddr_to_seg(pa); + vm_domain_free_assert_locked(VM_DOMAIN(seg->domain)); + + /* + * The pages on the free lists must be initialized. + */ +#ifdef VM_FREEPOOL_LAZYINIT + vm_phys_lazy_init_domain(seg->domain, true); +#endif + /* * First, find the contiguous, power of two-sized set of free * physical pages containing the given physical page "m" and * assign it to "m_set". */ - seg = &vm_phys_segs[m->segind]; - vm_domain_free_assert_locked(VM_DOMAIN(seg->domain)); + m = vm_phys_paddr_to_vm_page(pa); for (m_set = m, order = 0; m_set->order == VM_NFREEORDER && order < VM_NFREEORDER - 1; ) { order++; @@ -1459,7 +1626,7 @@ vm_phys_find_queues_contig( /* Search for a large enough free block. */ size = npages << PAGE_SHIFT; for (oind = order; oind < VM_NFREEORDER; oind++) { - for (pind = 0; pind < VM_NFREEPOOL; pind++) { + for (pind = vm_default_freepool; pind < VM_NFREEPOOL; pind++) { fl = (*queues)[pind]; TAILQ_FOREACH(m_ret, &fl[oind].pl, listq) { /* @@ -1479,7 +1646,7 @@ vm_phys_find_queues_contig( if (order < VM_NFREEORDER) return (NULL); /* Search for a long-enough sequence of max-order blocks. 
*/ - for (pind = 0; pind < VM_NFREEPOOL; pind++) { + for (pind = vm_default_freepool; pind < VM_NFREEPOOL; pind++) { fl = (*queues)[pind]; m_ret = vm_phys_find_freelist_contig(fl, npages, low, high, alignment, boundary); diff --git a/sys/vm/vm_phys.h b/sys/vm/vm_phys.h index 300443accdb6..bd086fd5571f 100644 --- a/sys/vm/vm_phys.h +++ b/sys/vm/vm_phys.h @@ -80,7 +80,7 @@ vm_page_t vm_phys_paddr_to_vm_page(vm_paddr_t pa); vm_page_t vm_phys_seg_paddr_to_vm_page(struct vm_phys_seg *seg, vm_paddr_t pa); void vm_phys_register_domains(int ndomains, struct mem_affinity *affinity, int *locality); -bool vm_phys_unfree_page(vm_page_t m); +bool vm_phys_unfree_page(vm_paddr_t pa); int vm_phys_mem_affinity(int f, int t); void vm_phys_early_add_seg(vm_paddr_t start, vm_paddr_t end); vm_paddr_t vm_phys_early_alloc(int domain, size_t alloc_size); From a03354b00238b73568efe225c754cba197393f77 Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Thu, 13 Jun 2024 20:16:28 -0400 Subject: [PATCH 67/91] arm64/vmm: Implement vm_disable_vcpu_creation() No functional change intended. 
Reviewed by: andrew Differential Revision: https://reviews.freebsd.org/D45556 --- sys/arm64/include/vmm.h | 1 + sys/arm64/vmm/vmm.c | 11 ++++++++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/sys/arm64/include/vmm.h b/sys/arm64/include/vmm.h index c06d2ad947e4..cf00dd60a43f 100644 --- a/sys/arm64/include/vmm.h +++ b/sys/arm64/include/vmm.h @@ -127,6 +127,7 @@ struct vm_eventinfo { int vm_create(const char *name, struct vm **retvm); struct vcpu *vm_alloc_vcpu(struct vm *vm, int vcpuid); +void vm_disable_vcpu_creation(struct vm *vm); void vm_slock_vcpus(struct vm *vm); void vm_unlock_vcpus(struct vm *vm); void vm_destroy(struct vm *vm); diff --git a/sys/arm64/vmm/vmm.c b/sys/arm64/vmm/vmm.c index a2cc63448f19..c6a49ebc4b03 100644 --- a/sys/arm64/vmm/vmm.c +++ b/sys/arm64/vmm/vmm.c @@ -141,6 +141,7 @@ struct vm { volatile cpuset_t active_cpus; /* (i) active vcpus */ volatile cpuset_t debug_cpus; /* (i) vcpus stopped for debug */ int suspend; /* (i) stop VM execution */ + bool dying; /* (o) is dying */ volatile cpuset_t suspended_cpus; /* (i) suspended vcpus */ volatile cpuset_t halted_cpus; /* (x) cpus in a hard halt */ struct mem_map mem_maps[VM_MAX_MEMMAPS]; /* (i) guest address space */ @@ -405,6 +406,14 @@ vm_init(struct vm *vm, bool create) } } +void +vm_disable_vcpu_creation(struct vm *vm) +{ + sx_xlock(&vm->vcpus_init_lock); + vm->dying = true; + sx_xunlock(&vm->vcpus_init_lock); +} + struct vcpu * vm_alloc_vcpu(struct vm *vm, int vcpuid) { @@ -423,7 +432,7 @@ vm_alloc_vcpu(struct vm *vm, int vcpuid) sx_xlock(&vm->vcpus_init_lock); vcpu = vm->vcpu[vcpuid]; - if (vcpu == NULL/* && !vm->dying*/) { + if (vcpu == NULL && !vm->dying) { vcpu = vcpu_alloc(vm, vcpuid); vcpu_init(vcpu); From d730cdea2ab32750e45955dd2e570152b6f81def Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Thu, 13 Jun 2024 20:16:57 -0400 Subject: [PATCH 68/91] arm64/vmm: Avoid unnecessary indirection in vmmops_modinit() Most of vmm.h is machine-independent. 
Simplify merging amd64 and arm64 vmm code by removing this machine-dependent routine from arm64's vmm.h. No functional change intended. Reviewed by: andrew Differential Revision: https://reviews.freebsd.org/D45557 --- sys/arm64/include/vmm.h | 7 ------- sys/arm64/vmm/vmm_arm64.c | 2 +- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/sys/arm64/include/vmm.h b/sys/arm64/include/vmm.h index cf00dd60a43f..05b013557c06 100644 --- a/sys/arm64/include/vmm.h +++ b/sys/arm64/include/vmm.h @@ -201,13 +201,6 @@ cpuset_t vm_active_cpus(struct vm *vm); cpuset_t vm_debug_cpus(struct vm *vm); cpuset_t vm_suspended_cpus(struct vm *vm); -static __inline bool -virt_enabled(void) -{ - - return (has_hyp()); -} - static __inline int vcpu_rendezvous_pending(struct vm_eventinfo *info) { diff --git a/sys/arm64/vmm/vmm_arm64.c b/sys/arm64/vmm/vmm_arm64.c index 6b058a993cdd..1b61871014a7 100644 --- a/sys/arm64/vmm/vmm_arm64.c +++ b/sys/arm64/vmm/vmm_arm64.c @@ -239,7 +239,7 @@ vmmops_modinit(int ipinum) int cpu, i; bool rv __diagused; - if (!virt_enabled()) { + if (!has_hyp()) { printf( "vmm: Processor doesn't have support for virtualization\n"); return (ENXIO); From aede0d3badd1be92b57deb14c494785ab61022d4 Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Thu, 13 Jun 2024 20:17:07 -0400 Subject: [PATCH 69/91] amd64/vmm: Make vmm.h more self-contained CTASSERT is defined in kassert.h, so include that here. No functional change intended. 
MFC after: 1 week --- sys/amd64/include/vmm.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h index 08c54ed7c49b..37972d54bd99 100644 --- a/sys/amd64/include/vmm.h +++ b/sys/amd64/include/vmm.h @@ -144,6 +144,8 @@ enum x2apic_state { (SPECNAMELEN - VM_MAX_PREFIXLEN - VM_MAX_SUFFIXLEN - 1) #ifdef _KERNEL +#include + CTASSERT(VM_MAX_NAMELEN >= VM_MIN_NAMELEN); struct vm; From fbff6d54da146e98ec2ce4ebfbb86339d4f9fa21 Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Fri, 14 Jun 2024 00:09:45 -0400 Subject: [PATCH 70/91] vm_phys: Fix vm_phys_find_range() after commit 69cbb18746b6 vm_phys_seg_paddr_to_vm_page() expects a PA that's in bounds, but vm_phys_find_range() purposefully returns a pointer to the end of the last page in a segment. Fixes: 69cbb18746b6 ("vm_phys: Add a vm_phys_seg_paddr_to_vm_page() helper") --- sys/vm/vm_phys.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sys/vm/vm_phys.c b/sys/vm/vm_phys.c index a28b3a40691e..53e58283eb9f 100644 --- a/sys/vm/vm_phys.c +++ b/sys/vm/vm_phys.c @@ -1450,7 +1450,7 @@ vm_phys_find_range(vm_page_t bounds[], int segind, int domain, vm_phys_lazy_init_domain(domain, false); #endif bounds[0] = vm_phys_seg_paddr_to_vm_page(seg, pa_start); - bounds[1] = vm_phys_seg_paddr_to_vm_page(seg, pa_end); + bounds[1] = &seg->first_page[atop(pa_end - seg->start)]; return (seg - vm_phys_segs); } return (-1); From 2a21cfe60fcf8e49dad60a9a40d2fc9e62cda563 Mon Sep 17 00:00:00 2001 From: Doug Moore Date: Fri, 14 Jun 2024 02:19:03 -0500 Subject: [PATCH 71/91] pctrie: avoid typecast Have PCTRIE_RECLAIM_CALLBACK typecast one function pointer type to another, to relieve the writer of the call back function from having to cast its first argument from void* to member type. 
Reviewed by: rlibby Differential Revision: https://reviews.freebsd.org/D45586 --- sys/kern/subr_rangeset.c | 3 +-- sys/sys/pctrie.h | 3 ++- sys/vm/swap_pager.c | 3 +-- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/sys/kern/subr_rangeset.c b/sys/kern/subr_rangeset.c index 94b77a449d22..ddb1839531ea 100644 --- a/sys/kern/subr_rangeset.c +++ b/sys/kern/subr_rangeset.c @@ -233,9 +233,8 @@ rangeset_remove(struct rangeset *rs, uint64_t start, uint64_t end) } static void -rangeset_remove_leaf(void *rv, void *rsv) +rangeset_remove_leaf(struct rs_el *r, void *rsv) { - struct rs_el *r = rv; struct rangeset *rs = rsv; rs->rs_free_data(rs->rs_data_ctx, r); diff --git a/sys/sys/pctrie.h b/sys/sys/pctrie.h index 4e1d8c7f8617..d06b533a54b7 100644 --- a/sys/sys/pctrie.h +++ b/sys/sys/pctrie.h @@ -226,9 +226,10 @@ name##_PCTRIE_RECLAIM(struct pctrie *ptree) \ */ \ static __inline __unused void \ name##_PCTRIE_RECLAIM_CALLBACK(struct pctrie *ptree, \ - pctrie_cb_t callback, void *arg) \ + void (*typed_cb)(struct type *, void *), void *arg) \ { \ struct pctrie_node *freenode, *node; \ + pctrie_cb_t callback = (pctrie_cb_t)typed_cb; \ \ for (freenode = pctrie_reclaim_begin_cb(&node, ptree, \ callback, __offsetof(struct type, field), arg); \ diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c index c339f70ddea1..455c39ab6fd7 100644 --- a/sys/vm/swap_pager.c +++ b/sys/vm/swap_pager.c @@ -2220,9 +2220,8 @@ swp_pager_meta_free(vm_object_t object, vm_pindex_t pindex, vm_pindex_t count, } static void -swp_pager_meta_free_block(void *sbv, void *rangev) +swp_pager_meta_free_block(struct swblk *sb, void *rangev) { - struct swblk *sb = sbv; struct page_range *range = rangev; for (int i = 0; i < SWAP_META_PAGES; i++) { From 70e3e1bde9bf0ddf7fc18b97487fe2d23d4fc5ed Mon Sep 17 00:00:00 2001 From: Ed Maste Date: Thu, 13 Jun 2024 11:23:26 -0400 Subject: [PATCH 72/91] tzsetup: add detail for -s option Skipping the UTC question via -s will not create or delete /etc/wall_cmos_clock. 
Reported by: Tomoaki AOKI Reviewed by: imp, allanjude, jrm Sponsored by: The FreeBSD Foundation Differential Revision: https://reviews.freebsd.org/D45576 --- usr.sbin/tzsetup/tzsetup.8 | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/usr.sbin/tzsetup/tzsetup.8 b/usr.sbin/tzsetup/tzsetup.8 index 60461363da9a..499d25765541 100644 --- a/usr.sbin/tzsetup/tzsetup.8 +++ b/usr.sbin/tzsetup/tzsetup.8 @@ -22,7 +22,7 @@ .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" -.Dd April 18, 2023 +.Dd June 14, 2024 .Dt TZSETUP 8 .Os .Sh NAME @@ -59,6 +59,10 @@ The name is obtained from .Pa /var/db/zoneinfo . .It Fl s Skip the initial question about adjusting the clock if not set to UTC. +.Nm +will neither create nor delete +.Pa /etc/wall_cmos_clock . +On a newly installed system the hardware clock will keep UTC. .El .Pp It is possible to short-circuit the menu system by specifying the From de4bfd6b9987987f82485e582beaa26338ed01d2 Mon Sep 17 00:00:00 2001 From: Reid Linnemann Date: Tue, 7 May 2024 16:46:35 -0600 Subject: [PATCH 73/91] udp_var: correct intoudpcb macro unintended identifier dependency Change 483fe9651 embedded struct inpcb into struct udpcb and updated the intoudpcb macro to use __containerof to locate it. This change accidentally introduced a dependency on the identifier inp being defined in the block the macro is expanded in. This should have been the macro argument ip. This change makes this simple correction. No functional change intended. Reviewed by: kp Sponsored by: Rubicon Communications, LLC ("Netgate") --- sys/netinet/udp_var.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sys/netinet/udp_var.h b/sys/netinet/udp_var.h index 0d70bad91df4..2528e4fcb30f 100644 --- a/sys/netinet/udp_var.h +++ b/sys/netinet/udp_var.h @@ -120,7 +120,7 @@ struct udpcb { void *u_tun_ctx; /* Tunneling callback context. 
*/ }; -#define intoudpcb(ip) __containerof((inp), struct udpcb, u_inpcb) +#define intoudpcb(ip) __containerof((ip), struct udpcb, u_inpcb) #define sotoudpcb(so) (intoudpcb(sotoinpcb(so))) VNET_PCPUSTAT_DECLARE(struct udpstat, udpstat); From ec7358885de020711b30114df59f21c6bf042639 Mon Sep 17 00:00:00 2001 From: Warner Losh Date: Fri, 14 Jun 2024 08:29:08 -0600 Subject: [PATCH 74/91] libkern.h: Make more self-sufficient libkern.h uses KASSERT, which fails when building in the boot loader. This is hacked around in a number of other places, but it's easier to just include sys/kassert.h here. Those other hacks still work, but are no longer really needed and can be torn down over time. Reviewed by: emaste Sponsored by: Netflix --- sys/sys/libkern.h | 1 + 1 file changed, 1 insertion(+) diff --git a/sys/sys/libkern.h b/sys/sys/libkern.h index afdfe7346b28..6dc4bc87c3fe 100644 --- a/sys/sys/libkern.h +++ b/sys/sys/libkern.h @@ -36,6 +36,7 @@ #ifdef _KERNEL #include #endif +#include #ifndef LIBKERN_INLINE #define LIBKERN_INLINE static __inline From 4fd5b8aed89643707556fa859ba58bcfd335f6f7 Mon Sep 17 00:00:00 2001 From: Warner Losh Date: Fri, 14 Jun 2024 08:30:30 -0600 Subject: [PATCH 75/91] boot1.chrp: Include memset Normally, memset isn't used. However for OPT_INIT_ALL=zero it is. Always include it since we're not space constrained and latter-day loaders won't include a copy if it's not actually used. Reviewed by: emaste Sponsored by: Netflix --- stand/powerpc/boot1.chrp/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stand/powerpc/boot1.chrp/Makefile b/stand/powerpc/boot1.chrp/Makefile index 22a3f983affd..11b9e356219b 100644 --- a/stand/powerpc/boot1.chrp/Makefile +++ b/stand/powerpc/boot1.chrp/Makefile @@ -6,7 +6,7 @@ NEWVERSWHAT= "Open Firmware boot block" ${MACHINE_ARCH} INSTALLFLAGS= -b FILES= boot1.hfs -SRCS= boot1.c ashldi3.c syncicache.c +SRCS= boot1.c ashldi3.c syncicache.c memset.c CFLAGS+=-I${LDRSRC} # Load boot1.elf below kernel. 
From aa6fb1d277be47c51abc309ac9305def0fce7f9d Mon Sep 17 00:00:00 2001 From: Ed Maste Date: Wed, 12 Jun 2024 19:48:24 -0400 Subject: [PATCH 76/91] tzsetup: ask local/UTC question only on x86 Storing local time in the RTC is a legacy of 1990s PCs; it's not relevant on other platforms of interest to FreeBSD. While here switch to C99 bool. Sponsored by: The FreeBSD Foundation Reviewed by: allanjude (earlier), imp (earlier) Differential Revision: https://reviews.freebsd.org/D45575 --- usr.sbin/tzsetup/tzsetup.8 | 3 ++- usr.sbin/tzsetup/tzsetup.c | 25 +++++++++++++++---------- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/usr.sbin/tzsetup/tzsetup.8 b/usr.sbin/tzsetup/tzsetup.8 index 499d25765541..4e70875ec74b 100644 --- a/usr.sbin/tzsetup/tzsetup.8 +++ b/usr.sbin/tzsetup/tzsetup.8 @@ -62,7 +62,8 @@ Skip the initial question about adjusting the clock if not set to UTC. .Nm will neither create nor delete .Pa /etc/wall_cmos_clock . -On a newly installed system the hardware clock will keep UTC. +On a newly installed system, the hardware clock will keep UTC. +This option is enabled automatically on non-x86 hardware. 
.El .Pp It is possible to short-circuit the menu system by specifying the diff --git a/usr.sbin/tzsetup/tzsetup.c b/usr.sbin/tzsetup/tzsetup.c index 617de4efb765..6629dd81f250 100644 --- a/usr.sbin/tzsetup/tzsetup.c +++ b/usr.sbin/tzsetup/tzsetup.c @@ -826,23 +826,28 @@ main(int argc, char **argv) char prompt[128]; int fd; #endif - int c, rv, skiputc; + int c, rv; + bool skiputc; + char *dztpath; +#if defined(__i386__) || defined(__amd64__) char vm_guest[16] = ""; size_t len = sizeof(vm_guest); - char *dztpath; + skiputc = false; + + /* Default skiputc to true for VM guests */ + if (sysctlbyname("kern.vm_guest", vm_guest, &len, NULL, 0) == 0 && + strcmp(vm_guest, "none") != 0) + skiputc = true; +#else + skiputc = true; +#endif dztpath = NULL; - skiputc = 0; #ifdef HAVE_BSDDIALOG setlocale(LC_ALL, ""); #endif - /* Default skiputc to 1 for VM guests */ - if (sysctlbyname("kern.vm_guest", vm_guest, &len, NULL, 0) == 0 && - strcmp(vm_guest, "none") != 0) - skiputc = 1; - while ((c = getopt(argc, argv, "C:d:nrs")) != -1) { switch (c) { case 'C': @@ -861,7 +866,7 @@ main(int argc, char **argv) #endif break; case 's': - skiputc = 1; + skiputc = true; break; default: usage(); @@ -951,7 +956,7 @@ main(int argc, char **argv) if (bsddialog_init() == BSDDIALOG_ERROR) errx(1, "Error bsddialog: %s\n", bsddialog_geterror()); - if (skiputc == 0) { + if (!skiputc) { snprintf(prompt, sizeof(prompt), "Is this machine's CMOS clock set to UTC? 
" "If it is set to local time,\n" From fb1028dcd4aedc4d48dbd97314f008c663b2e711 Mon Sep 17 00:00:00 2001 From: Adam Retter Date: Fri, 14 Jun 2024 11:57:15 -0400 Subject: [PATCH 77/91] hda: add support for Tiger Lake-H PR: 272682 Reported by: Miguel Salcedo Reviewed by: emaste --- sys/dev/sound/pci/hda/hdac.c | 1 + sys/dev/sound/pci/hda/hdac.h | 2 ++ sys/dev/sound/pci/hda/hdacc.c | 1 + 3 files changed, 4 insertions(+) diff --git a/sys/dev/sound/pci/hda/hdac.c b/sys/dev/sound/pci/hda/hdac.c index e45c121eb10b..336602b6bbf4 100644 --- a/sys/dev/sound/pci/hda/hdac.c +++ b/sys/dev/sound/pci/hda/hdac.c @@ -109,6 +109,7 @@ static const struct { { HDA_INTEL_CMLKLP, "Intel Comet Lake-LP", 0, 0 }, { HDA_INTEL_CMLKH, "Intel Comet Lake-H", 0, 0 }, { HDA_INTEL_TGLK, "Intel Tiger Lake", 0, 0 }, + { HDA_INTEL_TGLKH, "Intel Tiger Lake-H", 0, 0 }, { HDA_INTEL_GMLK, "Intel Gemini Lake", 0, 0 }, { HDA_INTEL_ALLK, "Intel Alder Lake", 0, 0 }, { HDA_INTEL_ALLKM, "Intel Alder Lake-M", 0, 0 }, diff --git a/sys/dev/sound/pci/hda/hdac.h b/sys/dev/sound/pci/hda/hdac.h index b40bfc7f6da3..4b4bd70f800f 100644 --- a/sys/dev/sound/pci/hda/hdac.h +++ b/sys/dev/sound/pci/hda/hdac.h @@ -98,6 +98,7 @@ #define HDA_INTEL_CMLKLP HDA_MODEL_CONSTRUCT(INTEL, 0x02c8) #define HDA_INTEL_CMLKH HDA_MODEL_CONSTRUCT(INTEL, 0x06c8) #define HDA_INTEL_TGLK HDA_MODEL_CONSTRUCT(INTEL, 0xa0c8) +#define HDA_INTEL_TGLKH HDA_MODEL_CONSTRUCT(INTEL, 0x43c8) #define HDA_INTEL_MTL HDA_MODEL_CONSTRUCT(INTEL, 0x7e28) #define HDA_INTEL_ARLS HDA_MODEL_CONSTRUCT(INTEL, 0x7f50) #define HDA_INTEL_ARL HDA_MODEL_CONSTRUCT(INTEL, 0x7728) @@ -911,6 +912,7 @@ #define HDA_CODEC_INTELGMLK1 HDA_CODEC_CONSTRUCT(INTEL, 0x280d) #define HDA_CODEC_INTELICLK HDA_CODEC_CONSTRUCT(INTEL, 0x280f) #define HDA_CODEC_INTELTGLK HDA_CODEC_CONSTRUCT(INTEL, 0x2812) +#define HDA_CODEC_INTELTGLKH HDA_CODEC_CONSTRUCT(INTEL, 0x2814) #define HDA_CODEC_INTELALLK HDA_CODEC_CONSTRUCT(INTEL, 0x2815) #define HDA_CODEC_INTELJLK HDA_CODEC_CONSTRUCT(INTEL, 0x281a) 
#define HDA_CODEC_INTELELLK HDA_CODEC_CONSTRUCT(INTEL, 0x281b) diff --git a/sys/dev/sound/pci/hda/hdacc.c b/sys/dev/sound/pci/hda/hdacc.c index 009c9098ac3b..81395a1a9ae7 100644 --- a/sys/dev/sound/pci/hda/hdacc.c +++ b/sys/dev/sound/pci/hda/hdacc.c @@ -393,6 +393,7 @@ static const struct { { HDA_CODEC_INTELGMLK1, 0, "Intel Gemini Lake" }, { HDA_CODEC_INTELICLK, 0, "Intel Ice Lake" }, { HDA_CODEC_INTELTGLK, 0, "Intel Tiger Lake" }, + { HDA_CODEC_INTELTGLKH, 0, "Intel Tiger Lake-H" }, { HDA_CODEC_INTELALLK, 0, "Intel Alder Lake" }, { HDA_CODEC_SII1390, 0, "Silicon Image SiI1390" }, { HDA_CODEC_SII1392, 0, "Silicon Image SiI1392" }, From 97ab935d566c85458bc7cab99c402f026d704eff Mon Sep 17 00:00:00 2001 From: "John F. Carr" Date: Fri, 14 Jun 2024 13:37:06 -0300 Subject: [PATCH 78/91] rk_pinctrl: fix error check The parse_bias method returns a signed int, with a value of -1 when the device tree reports nothing of the bias configuration. Convert the local 'bias' from unsigned to signed to properly check this condition. PR: 229721 Reviewed by: mhorne MFC after: 3 days --- sys/arm64/rockchip/rk_pinctrl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sys/arm64/rockchip/rk_pinctrl.c b/sys/arm64/rockchip/rk_pinctrl.c index 4892b0bf236c..6137f7f0d50a 100644 --- a/sys/arm64/rockchip/rk_pinctrl.c +++ b/sys/arm64/rockchip/rk_pinctrl.c @@ -1141,9 +1141,9 @@ rk_pinctrl_configure_pin(struct rk_pinctrl_softc *sc, uint32_t *pindata) { phandle_t pin_conf; struct syscon *syscon; - uint32_t bank, subbank, pin, function, bias; + uint32_t bank, subbank, pin, function; uint32_t bit, mask, reg, drive; - int i, rv; + int i, rv, bias; bank = pindata[0]; pin = pindata[1]; From cadc9c7db780877396da23f3683a8c69c9b1c01e Mon Sep 17 00:00:00 2001 From: "John F. Carr" Date: Fri, 14 Jun 2024 11:06:03 -0600 Subject: [PATCH 79/91] boot/efi: Fix warning for non-standard formats when debugging Add -Wno-format for zfs_module and regroup. 
This fixes warnings when EFI_DEBUG is defined. PR: 279071 Reviewed-by: imp --- stand/efi/boot1/Makefile | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/stand/efi/boot1/Makefile b/stand/efi/boot1/Makefile index fb1c7d74eec1..607c30f10cf7 100644 --- a/stand/efi/boot1/Makefile +++ b/stand/efi/boot1/Makefile @@ -12,11 +12,12 @@ CFLAGS+= -DEFI_BOOT1 # seems to matter on arm64 where wchar_t defaults to an int instead # of a short. There's no good cast to use here so just ignore the # warnings for now. -CWARNFLAGS.proto.c+= -Wno-format -CWARNFLAGS.boot1.c+= -Wno-format +CWARNFLAGS.proto.c += -Wno-format +CWARNFLAGS.boot1.c += -Wno-format +CWARNFLAGS.ufs_module.c += -Wno-format +CWARNFLAGS.zfs_module.c += -Wno-format # Disable bogus alignment issues -CWARNFLAGS.ufs_module.c += -Wno-format CWARNFLAGS.ufs_module.c += -Wno-cast-align # Disable warnings that are currently incompatible with the zfs boot code From 0e4e77072f65df71644140e41229b868a73f3371 Mon Sep 17 00:00:00 2001 From: Mitchell Horne Date: Fri, 14 Jun 2024 14:57:28 -0300 Subject: [PATCH 80/91] riscv: adjust physmem reservation Make sure we do this BEFORE pmap_bootstrap(). 
Reviewed by: markj, jhb MFC after: 3 days Sponsored by: The FreeBSD Foundation Differential Revision: https://reviews.freebsd.org/D45325 --- sys/riscv/riscv/machdep.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sys/riscv/riscv/machdep.c b/sys/riscv/riscv/machdep.c index f44056f56212..1e2b96acd3a1 100644 --- a/sys/riscv/riscv/machdep.c +++ b/sys/riscv/riscv/machdep.c @@ -549,10 +549,6 @@ initriscv(struct riscv_bootparams *rvbp) cache_setup(); - /* Bootstrap enough of pmap to enter the kernel proper */ - kernlen = (lastaddr - KERNBASE); - pmap_bootstrap(rvbp->kern_l1pt, rvbp->kern_phys, kernlen); - #ifdef FDT /* * XXX: Unconditionally exclude the lowest 2MB of physical memory, as @@ -565,6 +561,11 @@ initriscv(struct riscv_bootparams *rvbp) physmem_exclude_region(mem_regions[0].mr_start, L2_SIZE, EXFLAG_NODUMP | EXFLAG_NOALLOC); #endif + + /* Bootstrap enough of pmap to enter the kernel proper */ + kernlen = (lastaddr - KERNBASE); + pmap_bootstrap(rvbp->kern_l1pt, rvbp->kern_phys, kernlen); + physmem_init_kernel_globals(); /* Establish static device mappings */ From 3ff981587ff8f05c06092d05edcc50f1ede1bbd6 Mon Sep 17 00:00:00 2001 From: Mitchell Horne Date: Fri, 24 Nov 2023 13:21:51 -0400 Subject: [PATCH 81/91] riscv: Don't handle missing kernel L3 pages This code path should never be hit, if it does it means we did not bootstrap correctly. Turn it into a panic like we do on amd64 and arm64. 
Reviewed by: markj, jhb Sponsored by: The FreeBSD Foundation Differential Revision: https://reviews.freebsd.org/D45326 --- sys/riscv/riscv/pmap.c | 43 +++++------------------------------------- 1 file changed, 5 insertions(+), 38 deletions(-) diff --git a/sys/riscv/riscv/pmap.c b/sys/riscv/riscv/pmap.c index 8176975b049c..1902f1f4009b 100644 --- a/sys/riscv/riscv/pmap.c +++ b/sys/riscv/riscv/pmap.c @@ -2911,14 +2911,13 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind) { struct rwlock *lock; - pd_entry_t *l1, *l2, l2e; + pd_entry_t *l2, l2e; pt_entry_t new_l3, orig_l3; pt_entry_t *l3; pv_entry_t pv; - vm_paddr_t opa, pa, l2_pa, l3_pa; - vm_page_t mpte, om, l2_m, l3_m; - pt_entry_t entry; - pn_t l2_pn, l3_pn, pn; + vm_paddr_t opa, pa; + vm_page_t mpte, om; + pn_t pn; int rv; bool nosleep; @@ -2990,39 +2989,7 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, } l3 = pmap_l3(pmap, va); } else { - l3 = pmap_l3(pmap, va); - /* TODO: This is not optimal, but should mostly work */ - if (l3 == NULL) { - if (l2 == NULL) { - l2_m = vm_page_alloc_noobj(VM_ALLOC_WIRED | - VM_ALLOC_ZERO); - if (l2_m == NULL) - panic("pmap_enter: l2 pte_m == NULL"); - - l2_pa = VM_PAGE_TO_PHYS(l2_m); - l2_pn = (l2_pa / PAGE_SIZE); - - l1 = pmap_l1(pmap, va); - entry = (PTE_V); - entry |= (l2_pn << PTE_PPN0_S); - pmap_store(l1, entry); - pmap_distribute_l1(pmap, pmap_l1_index(va), entry); - l2 = pmap_l1_to_l2(l1, va); - } - - l3_m = vm_page_alloc_noobj(VM_ALLOC_WIRED | - VM_ALLOC_ZERO); - if (l3_m == NULL) - panic("pmap_enter: l3 pte_m == NULL"); - - l3_pa = VM_PAGE_TO_PHYS(l3_m); - l3_pn = (l3_pa / PAGE_SIZE); - entry = (PTE_V); - entry |= (l3_pn << PTE_PPN0_S); - pmap_store(l2, entry); - l3 = pmap_l2_to_l3(l2, va); - } - pmap_invalidate_page(pmap, va); + panic("pmap_enter: missing L3 table for kernel va %#lx", va); } orig_l3 = pmap_load(l3); From 134f7b5fa91fa41e3c3c7caeaf1dff72018684e5 Mon Sep 17 00:00:00 2001 From: Mitchell 
Horne Date: Mon, 3 Jun 2024 14:24:12 -0300 Subject: [PATCH 82/91] riscv: improve commentary around initial stvec Make it explicit why we must set the trap vector before enabling virtual memory. Reviewed by: br, jhb, markj Sponsored by: The FreeBSD Foundation Differential Revision: https://reviews.freebsd.org/D45474 --- sys/riscv/riscv/locore.S | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/sys/riscv/riscv/locore.S b/sys/riscv/riscv/locore.S index 5a7e15ba443b..7e885ab8ca89 100644 --- a/sys/riscv/riscv/locore.S +++ b/sys/riscv/riscv/locore.S @@ -197,7 +197,11 @@ pagetables: /* Page tables END */ - /* Setup supervisor trap vector */ + /* + * Set the supervisor trap vector temporarily. Enabling virtual memory + * may generate a page fault. We simply wish to continue onwards, so + * have the trap deliver us to 'va'. + */ 2: lla t0, va sub t0, t0, s9 @@ -221,7 +225,7 @@ va: lla gp, __global_pointer$ .option pop - /* Setup supervisor trap vector */ + /* Set the trap vector to the real handler. */ la t0, cpu_exception_handler csrw stvec, t0 @@ -342,7 +346,11 @@ ENTRY(mpentry) /* Get the kernel's load address */ jal get_physmem - /* Setup supervisor trap vector */ + /* + * Set the supervisor trap vector temporarily. Enabling virtual memory + * may generate a page fault. We simply wish to continue onwards, so + * have the trap deliver us to 'mpva'. + */ lla t0, mpva sub t0, t0, s9 li t1, KERNBASE @@ -365,7 +373,7 @@ mpva: lla gp, __global_pointer$ .option pop - /* Setup supervisor trap vector */ + /* Set the trap vector to the real handler. 
*/ la t0, cpu_exception_handler csrw stvec, t0 From 2909ddd17cb4d750852dc04128e584f93f8c5058 Mon Sep 17 00:00:00 2001 From: Alan Somers Date: Wed, 12 Jun 2024 15:34:05 -0600 Subject: [PATCH 83/91] ctld: plug memory leaks MFC after: 2 weeks Reviewed by: mav Sponsored by: Axcient Reported by: valgrind Pull Request: https://github.com/freebsd/freebsd-src/pull/1288 --- usr.sbin/ctld/ctld.c | 1 + usr.sbin/ctld/kernel.c | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/usr.sbin/ctld/ctld.c b/usr.sbin/ctld/ctld.c index c31ac328c84e..bf2791040125 100644 --- a/usr.sbin/ctld/ctld.c +++ b/usr.sbin/ctld/ctld.c @@ -2873,6 +2873,7 @@ main(int argc, char **argv) error = conf_apply(oldconf, newconf); if (error != 0) log_warnx("failed to apply configuration"); + conf_delete(newconf); conf_delete(oldconf); oldconf = NULL; diff --git a/usr.sbin/ctld/kernel.c b/usr.sbin/ctld/kernel.c index dd1c89d0e9b8..ae455e7815f7 100644 --- a/usr.sbin/ctld/kernel.c +++ b/usr.sbin/ctld/kernel.c @@ -614,6 +614,22 @@ conf_new_from_kernel(void) } cp->p_ctl_port = port->port_id; } + while ((port = STAILQ_FIRST(&devlist.port_list))) { + struct cctl_lun_nv *nv; + + STAILQ_REMOVE_HEAD(&devlist.port_list, links); + free(port->port_frontend); + free(port->port_name); + free(port->cfiscsi_target); + free(port->ctld_portal_group_name); + while ((nv = STAILQ_FIRST(&port->attr_list))) { + STAILQ_REMOVE_HEAD(&port->attr_list, links); + free(nv->value); + free(nv->name); + free(nv); + } + free(port); + } free(name); STAILQ_FOREACH(lun, &devlist.lun_list, links) { @@ -664,6 +680,18 @@ conf_new_from_kernel(void) cl->l_name); } } + while ((lun = STAILQ_FIRST(&devlist.lun_list))) { + struct cctl_lun_nv *nv; + + STAILQ_REMOVE_HEAD(&devlist.lun_list, links); + while ((nv = STAILQ_FIRST(&lun->attr_list))) { + STAILQ_REMOVE_HEAD(&lun->attr_list, links); + free(nv->value); + free(nv->name); + free(nv); + } + free(lun); + } return (conf); } @@ -741,12 +769,14 @@ kernel_lun_add(struct 
lun *lun) req.args = nvlist_pack(req.args_nvl, &req.args_len); if (req.args == NULL) { + nvlist_destroy(req.args_nvl); log_warn("error packing nvlist"); return (1); } } error = ioctl(ctl_fd, CTL_LUN_REQ, &req); + free(req.args); nvlist_destroy(req.args_nvl); if (error != 0) { @@ -824,12 +854,14 @@ kernel_lun_modify(struct lun *lun) req.args = nvlist_pack(req.args_nvl, &req.args_len); if (req.args == NULL) { + nvlist_destroy(req.args_nvl); log_warn("error packing nvlist"); return (1); } } error = ioctl(ctl_fd, CTL_LUN_REQ, &req); + free(req.args); nvlist_destroy(req.args_nvl); if (error != 0) { @@ -1052,6 +1084,7 @@ kernel_port_add(struct port *port) req.args = nvlist_pack(req.args_nvl, &req.args_len); if (req.args == NULL) { + nvlist_destroy(req.args_nvl); log_warn("error packing nvlist"); return (1); } @@ -1059,6 +1092,7 @@ kernel_port_add(struct port *port) req.result = result_buf; req.result_len = sizeof(result_buf); error = ioctl(ctl_fd, CTL_PORT_REQ, &req); + free(req.args); nvlist_destroy(req.args_nvl); if (error != 0) { @@ -1202,11 +1236,13 @@ kernel_port_remove(struct port *port) req.args = nvlist_pack(req.args_nvl, &req.args_len); if (req.args == NULL) { + nvlist_destroy(req.args_nvl); log_warn("error packing nvlist"); return (1); } error = ioctl(ctl_fd, CTL_PORT_REQ, &req); + free(req.args); nvlist_destroy(req.args_nvl); if (error != 0) { From a70ecfb11757812cd97b6499dc4b73984c310681 Mon Sep 17 00:00:00 2001 From: Alexander Leidinger Date: Fri, 14 Jun 2024 20:05:52 +0200 Subject: [PATCH 84/91] rc.subr: add new sysv option for service jails Clarify that the "sysvipc" svcj option inherits from the host / parent. Add "sysvipcnew" which creates a new SysV namespace for the service jail. Sanity check that only one of them is used. 
--- libexec/rc/rc.subr | 11 +++++++++++ share/man/man5/rc.conf.5 | 8 ++++++-- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/libexec/rc/rc.subr b/libexec/rc/rc.subr index 2380d1aeabc3..f9d8bf9a3cc3 100644 --- a/libexec/rc/rc.subr +++ b/libexec/rc/rc.subr @@ -1219,6 +1219,7 @@ run_rc_command() if [ -n "$_svcj_options" ]; then # translate service jail options _svcj_cmd_options="" + _svcj_sysvipc_x=0 for _svcj_option in $_svcj_options; do case "$_svcj_option" in mlock) @@ -1243,8 +1244,13 @@ run_rc_command() _svcj_cmd_options="allow.nfsd enforce_statfs=1 ${_svcj_cmd_options}" ;; sysvipc) + _svcj_sysvipc_x=$((${_svcj_sysvipc_x} + 1)) _svcj_cmd_options="sysvmsg=inherit sysvsem=inherit sysvshm=inherit ${_svcj_cmd_options}" ;; + sysvipcnew) + _svcj_sysvipc_x=$((${_svcj_sysvipc_x} + 1)) + _svcj_cmd_options="sysvmsg=new sysvsem=new sysvshm=new ${_svcj_cmd_options}" + ;; vmm) _svcj_cmd_options="allow.vmm ${_svcj_cmd_options}" ;; @@ -1253,6 +1259,11 @@ run_rc_command() ;; esac done + if [ ${_svcj_sysvipc_x} -gt 1 ]; then + echo -n "ERROR: more than one sysvipc option is " + echo "specified in ${name}_svcj_options: $_svcj_options" + return 1 + fi fi [ -z "$autoboot" ] && eval $_pidcmd # determine the pid if necessary diff --git a/share/man/man5/rc.conf.5 b/share/man/man5/rc.conf.5 index 9f32cfa5ab82..3fa20fe3cf0c 100644 --- a/share/man/man5/rc.conf.5 +++ b/share/man/man5/rc.conf.5 @@ -4977,8 +4977,11 @@ to them. .It nfsd Allows to run nfsd and affiliated daemons. .It sysvipc -Allows access to SysV semaphores, SysV shared memory and -SysV messages. +Inherits the SysV semaphores, SysV shared memory and +SysV messages from the host or the parent jail. +.It sysvipcnew +Creates a new namespace for SysV semaphores, SysV shared memory +and SysV messages for this particular service jail. .It vmm Allows access to .Xr vmm 4 . @@ -4988,6 +4991,7 @@ is enabled in the kernel. .El All non-network options can be combined with all other options. 
+From the SysV options only one option can be specified. If the .Ao Ar name Ac Ns Va _svcj From 2d08f6b577e9d58848cd7734dc979e60fe6f0165 Mon Sep 17 00:00:00 2001 From: Alexander Leidinger Date: Fri, 14 Jun 2024 20:10:07 +0200 Subject: [PATCH 85/91] rc.subr: add some sanity checks for service jails Add some sanity checks when service jails are used in jails: - children.max > 0 - children.max - children.cur > 0 The nesting is too deep at those places to have a sane formatting, so no line wrapping at the usual column. If someone has a better idea how to format this: feel free to go ahead. --- libexec/rc/rc.subr | 39 ++++++++++++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/libexec/rc/rc.subr b/libexec/rc/rc.subr index f9d8bf9a3cc3..e540d8f7d207 100644 --- a/libexec/rc/rc.subr +++ b/libexec/rc/rc.subr @@ -1332,11 +1332,28 @@ run_rc_command() start) if [ "${_rc_svcj}" != jailing ]; then _return=1 - $JAIL_CMD -c $_svcj_generic_params $_svcj_cmd_options \ - exec.start="${SERVICE} -E _rc_svcj=jailing ${name} ${_rc_prefix}start $rc_extra_args" \ - exec.stop="${SERVICE} -E _rc_svcj=jailing ${name} ${_rc_prefix}stop $rc_extra_args" \ - exec.consolelog="/var/log/svcj_${name}_console.log" \ - name=svcj-${name} && _return=0 + _do_jailing=1 + + if check_jail jailed; then + if [ $(${SYSCTL_N} security.jail.children.max) -eq 0 ]; then + echo ERROR: jail parameter children.max is set to 0, can not create a new service jail. + _do_jailing=0 + else + _free_jails=$(($(${SYSCTL_N} security.jail.children.max) - $(${SYSCTL_N} security.jail.children.cur))) + if [ ${_free_jails} -eq 0 ]; then + echo ERROR: max number of jail children reached, can not create a new service jail. 
+ _do_jailing=0 + + fi + fi + fi + if [ ${_do_jailing} -eq 1 ]; then + $JAIL_CMD -c $_svcj_generic_params $_svcj_cmd_options \ + exec.start="${SERVICE} -E _rc_svcj=jailing ${name} ${_rc_prefix}start $rc_extra_args" \ + exec.stop="${SERVICE} -E _rc_svcj=jailing ${name} ${_rc_prefix}stop $rc_extra_args" \ + exec.consolelog="/var/log/svcj_${name}_console.log" \ + name=svcj-${name} && _return=0 + fi else _run_rc_doit "$_cpusetcmd $_cmd $rc_extra_args" || _return=1 fi @@ -1432,6 +1449,18 @@ run_rc_command() if checkyesno ${name}_svcj; then if [ "${_rc_svcj}" != jailing ]; then + if check_jail jailed; then + if [ $(${SYSCTL_N} security.jail.children.max) -eq 0 ]; then + echo ERROR: jail parameter children.max is set to 0, can not create a new service jail. + return 1 + else + _free_jails=$(($(${SYSCTL_N} security.jail.children.max) - $(${SYSCTL_N} security.jail.children.cur))) + if [ ${_free_jails} -eq 0 ]; then + echo ERROR: max number of jail children reached, can not create a new service jail. 
+ return 1 + fi + fi + fi $JAIL_CMD -c $_svcj_generic_params $_svcj_cmd_options\ exec.start="${SERVICE} -E _rc_svcj=jailing ${name} ${_rc_prefix}start $rc_extra_args" \ exec.stop="${SERVICE} -E _rc_svcj=jailing ${name} ${_rc_prefix}stop $rc_extra_args" \ From 80b42329248a7473f79eebf7850ee8e4116c17dd Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 14 Jun 2024 15:32:10 -0400 Subject: [PATCH 86/91] nvme: Fix panic on detach after ce75bfcac9cfe MFC after: 2 weeks --- sys/dev/nvme/nvme_ns.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sys/dev/nvme/nvme_ns.c b/sys/dev/nvme/nvme_ns.c index b19fc8664407..4c65e2c49e64 100644 --- a/sys/dev/nvme/nvme_ns.c +++ b/sys/dev/nvme/nvme_ns.c @@ -620,8 +620,9 @@ void nvme_ns_destruct(struct nvme_namespace *ns) { - if (ns->cdev->si_drv2 != NULL) - destroy_dev(ns->cdev->si_drv2); - if (ns->cdev != NULL) + if (ns->cdev != NULL) { + if (ns->cdev->si_drv2 != NULL) + destroy_dev(ns->cdev->si_drv2); destroy_dev(ns->cdev); + } } From 517c5854588eaa7c2248d97cd750b8b8bad9d69f Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Fri, 14 Jun 2024 10:45:02 -0400 Subject: [PATCH 87/91] vm_phys: Make sure that vm_phys_enq_chunk() stays in bounds vm_phys_enq_chunk() inserts a run of pages into the buddy queues. When lazy initialization is enabled, only the first page of each run is initialized; vm_phys_enq_chunk() thus initializes the page following the just-inserted run. This fails to account for the possibility that the page following the run doesn't belong to the segment. Handle that in vm_phys_enq_chunk(). 
Reported by: KASAN Reported by: syzbot+1097ef4cee8dfb240e31@syzkaller.appspotmail.com Fixes: b16b4c22d2d1 ("vm_page: Implement lazy page initialization") --- sys/vm/vm_phys.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/sys/vm/vm_phys.c b/sys/vm/vm_phys.c index 53e58283eb9f..59ab7d13c55d 100644 --- a/sys/vm/vm_phys.c +++ b/sys/vm/vm_phys.c @@ -711,12 +711,16 @@ vm_phys_enq_chunk(struct vm_freelist *fl, vm_page_t m, int order, int tail) #ifdef VM_FREEPOOL_LAZYINIT if (__predict_false(m->pool == VM_FREEPOOL_LAZYINIT)) { vm_page_t m_next; + vm_paddr_t pa; int npages; npages = 1 << order; m_next = m + npages; - vm_page_init_page(m_next, m->phys_addr + ptoa(npages), m->segind, - VM_FREEPOOL_LAZYINIT); + pa = m->phys_addr + ptoa(npages); + if (pa < vm_phys_segs[m->segind].end) { + vm_page_init_page(m_next, pa, m->segind, + VM_FREEPOOL_LAZYINIT); + } } #endif } From 5198178f3eb6f9d1eae68adf00ac4aef62b2bb1d Mon Sep 17 00:00:00 2001 From: Warner Losh Date: Fri, 14 Jun 2024 15:06:53 -0600 Subject: [PATCH 88/91] cdefs.h: Add POSIX.1-2024 values Sponsored by: Netflix --- sys/sys/cdefs.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/sys/sys/cdefs.h b/sys/sys/cdefs.h index c3268791787f..476c89d1dddb 100644 --- a/sys/sys/cdefs.h +++ b/sys/sys/cdefs.h @@ -639,10 +639,13 @@ * and the omnibus ISO/IEC 9945-1: 1996 * (1003.1 Issue 5, Single Unix Spec v2, Unix 95) * _POSIX_C_SOURCE == 200112 1003.1-2001 (1003.1 Issue 6, Unix 03) + * with _XOPEN_SOURCE=600 * _POSIX_C_SOURCE == 200809 1003.1-2008 (1003.1 Issue 7) * IEEE Std 1003.1-2017 (Rev of 1003.1-2008) is - * 1003.1-2008 with two TCs applied with - * _POSIX_C_SOURCE=200809 and _XOPEN_SOURCE=700 + * 1003.1-2008 with two TCs applied and + * _XOPEN_SOURCE=700 + * _POSIX_C_SOURCE == 202405 1003.1-2024 (1003.1 Issue 8), IEEE Std 1003.1-2024 + * with _XOPEN_SOURCE=800 * * In addition, the X/Open Portability Guide, which is now the Single UNIX * Specification, defines a feature-test
macro which indicates the version of From 1bce7cd885e7e5b376a60367629a0f76ff7f0167 Mon Sep 17 00:00:00 2001 From: Warner Losh Date: Fri, 14 Jun 2024 16:40:08 -0600 Subject: [PATCH 89/91] nvme: Add Linux compatible ioctls Add the NVME_IOCTL_ID, NVME_IOCTL_ADMIN_CMD, and NVME_IOCTL_IO_CMD Linux compatible ioctls. These may be run on either an I/O (ns) dev or a nvme (admin) dev. Linux allows both on either device, and programs use this and aren't careful about having the right device open. Emulate this feature, and implement these ioctls. The data is passed into the kernel in host byte order (not converted to le). Results are returned in host order. The timeout field is ignored, and the metadata and metadata_len fields must be zero. The addr field can be null, even when the data_len is non zero (FreeBSD's ioctl interface prohibits this, Linux's just ignores the inconsistency). Only the cdw0 is returned from the command: the status is not returned in the 'result' field. XXX need to verify that this is what Linux does on an error signaled from the drive. No external include file is yet available for this: most programs that call this interface either use a linux-specific path or have their own private copy of the data. It's unclear the best thing to do. Also, create a /dev/nvmeXnY as an alias for /dev/nvmeXnsY. These changes allow a native build of nvme-cli to work for everything that doesn't depend on sysfs entries in /sys, calls that use metadata, send / receive drive data and sed functionality not in our nvme driver.
Sponsored by: Netflix Co-Authored-by: Chuck Tuffli Reviewed by: chuck Differential Revision: https://reviews.freebsd.org/D45415 --- sys/dev/nvme/nvme.h | 6 ++ sys/dev/nvme/nvme_ctrlr.c | 114 +++++++++++++++++++++++++++++++++++++- sys/dev/nvme/nvme_linux.h | 58 +++++++++++++++++++ sys/dev/nvme/nvme_ns.c | 14 ++++- 4 files changed, 190 insertions(+), 2 deletions(-) create mode 100644 sys/dev/nvme/nvme_linux.h diff --git a/sys/dev/nvme/nvme.h b/sys/dev/nvme/nvme.h index a389fc443743..1db50d24c259 100644 --- a/sys/dev/nvme/nvme.h +++ b/sys/dev/nvme/nvme.h @@ -1902,6 +1902,7 @@ struct thread; struct nvme_namespace; struct nvme_controller; struct nvme_consumer; +struct nvme_passthru_cmd; typedef void (*nvme_cb_fn_t)(void *, const struct nvme_completion *); @@ -1921,6 +1922,11 @@ int nvme_ctrlr_passthrough_cmd(struct nvme_controller *ctrlr, uint32_t nsid, int is_user_buffer, int is_admin_cmd); +int nvme_ctrlr_linux_passthru_cmd(struct nvme_controller *ctrlr, + struct nvme_passthru_cmd *npc, + uint32_t nsid, bool is_user, + bool is_admin); + /* Admin functions */ void nvme_ctrlr_cmd_set_feature(struct nvme_controller *ctrlr, uint8_t feature, uint32_t cdw11, diff --git a/sys/dev/nvme/nvme_ctrlr.c b/sys/dev/nvme/nvme_ctrlr.c index 155aedf2f31a..f058a4e33b9f 100644 --- a/sys/dev/nvme/nvme_ctrlr.c +++ b/sys/dev/nvme/nvme_ctrlr.c @@ -43,6 +43,7 @@ #include #include "nvme_private.h" +#include "nvme_linux.h" #define B4_CHK_RDY_DELAY_MS 2300 /* work around controller bug */ @@ -1269,7 +1270,7 @@ nvme_ctrlr_passthrough_cmd(struct nvme_controller *ctrlr, ret = EFAULT; goto err; } - req = nvme_allocate_request_vaddr(buf->b_data, pt->len, + req = nvme_allocate_request_vaddr(buf->b_data, pt->len, nvme_pt_done, pt); } else req = nvme_allocate_request_vaddr(pt->buf, pt->len, @@ -1314,6 +1315,103 @@ nvme_ctrlr_passthrough_cmd(struct nvme_controller *ctrlr, return (ret); } +static void +nvme_npc_done(void *arg, const struct nvme_completion *cpl) +{ + struct nvme_passthru_cmd *npc = arg; + 
struct mtx *mtx = (void *)(uintptr_t)npc->metadata; + + npc->result = cpl->cdw0; /* cpl in host order by now */ + mtx_lock(mtx); + npc->metadata = 0; + wakeup(npc); + mtx_unlock(mtx); +} + +/* XXX refactor? */ + +int +nvme_ctrlr_linux_passthru_cmd(struct nvme_controller *ctrlr, + struct nvme_passthru_cmd *npc, uint32_t nsid, bool is_user, bool is_admin) +{ + struct nvme_request *req; + struct mtx *mtx; + struct buf *buf = NULL; + int ret = 0; + + /* + * We don't support metadata. + */ + if (npc->metadata != 0 || npc->metadata_len != 0) + return (EIO); + + if (npc->data_len > 0 && npc->addr != 0) { + if (npc->data_len > ctrlr->max_xfer_size) { + nvme_printf(ctrlr, + "npc->data_len (%d) exceeds max_xfer_size (%d)\n", + npc->data_len, ctrlr->max_xfer_size); + return (EIO); + } + /* We only support data out or data in commands, but not both at once. */ + if ((npc->opcode & 0x3) == 0 || (npc->opcode & 0x3) == 3) + return (EINVAL); + if (is_user) { + /* + * Ensure the user buffer is wired for the duration of + * this pass-through command. + */ + PHOLD(curproc); + buf = uma_zalloc(pbuf_zone, M_WAITOK); + buf->b_iocmd = npc->opcode & 1 ? 
BIO_WRITE : BIO_READ; + if (vmapbuf(buf, (void *)npc->addr, npc->data_len, 1) < 0) { + ret = EFAULT; + goto err; + } + req = nvme_allocate_request_vaddr(buf->b_data, npc->data_len, + nvme_npc_done, npc); + } else + req = nvme_allocate_request_vaddr((void *)npc->addr, npc->data_len, + nvme_npc_done, npc); + } else + req = nvme_allocate_request_null(nvme_npc_done, npc); + + req->cmd.opc = npc->opcode; + req->cmd.fuse = npc->flags; + req->cmd.rsvd2 = htole16(npc->cdw2); + req->cmd.rsvd3 = htole16(npc->cdw3); + req->cmd.cdw10 = htole32(npc->cdw10); + req->cmd.cdw11 = htole32(npc->cdw11); + req->cmd.cdw12 = htole32(npc->cdw12); + req->cmd.cdw13 = htole32(npc->cdw13); + req->cmd.cdw14 = htole32(npc->cdw14); + req->cmd.cdw15 = htole32(npc->cdw15); + + req->cmd.nsid = htole32(nsid); + + mtx = mtx_pool_find(mtxpool_sleep, npc); + npc->metadata = (uintptr_t) mtx; + + /* XXX no timeout passed down */ + if (is_admin) + nvme_ctrlr_submit_admin_request(ctrlr, req); + else + nvme_ctrlr_submit_io_request(ctrlr, req); + + mtx_lock(mtx); + while (npc->metadata != 0) + mtx_sleep(npc, mtx, PRIBIO, "nvme_npc", 0); + mtx_unlock(mtx); + + if (buf != NULL) { + vunmapbuf(buf); +err: + uma_zfree(pbuf_zone, buf); + PRELE(curproc); + } + + return (ret); +} + static int nvme_ctrlr_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag, struct thread *td) @@ -1324,6 +1422,7 @@ nvme_ctrlr_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag, ctrlr = cdev->si_drv1; switch (cmd) { + case NVME_IOCTL_RESET: /* Linux compat */ case NVME_RESET_CONTROLLER: nvme_ctrlr_reset(ctrlr); break; @@ -1342,6 +1441,19 @@ nvme_ctrlr_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag, case NVME_GET_MAX_XFER_SIZE: *(uint64_t *)arg = ctrlr->max_xfer_size; break; + /* Linux Compatible (see nvme_linux.h) */ + case NVME_IOCTL_ID: + td->td_retval[0] = 0xfffffffful; + return (0); + + case NVME_IOCTL_ADMIN_CMD: + case NVME_IOCTL_IO_CMD: { + struct nvme_passthru_cmd *npc = (struct nvme_passthru_cmd *)arg; 
+ + return (nvme_ctrlr_linux_passthru_cmd(ctrlr, npc, npc->nsid, true, + cmd == NVME_IOCTL_ADMIN_CMD)); + } + default: return (ENOTTY); } diff --git a/sys/dev/nvme/nvme_linux.h b/sys/dev/nvme/nvme_linux.h new file mode 100644 index 000000000000..aaa68e1d34f8 --- /dev/null +++ b/sys/dev/nvme/nvme_linux.h @@ -0,0 +1,58 @@ +/*- + * Copyright (c) 2024, Netflix Inc. + * Written by Warner Losh + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +/* + * Linux compatible NVME ioctls. So far we just support ID, ADMIN_CMD and + * IO_CMD. The rest are not supported. + */ + + +#include +#include + +struct nvme_passthru_cmd { + __uint8_t opcode; + __uint8_t flags; + __uint16_t rsvd1; + __uint32_t nsid; + __uint32_t cdw2; + __uint32_t cdw3; + __uint64_t metadata; + __uint64_t addr; + __uint32_t metadata_len; + __uint32_t data_len; + __uint32_t cdw10; + __uint32_t cdw11; + __uint32_t cdw12; + __uint32_t cdw13; + __uint32_t cdw14; + __uint32_t cdw15; + __uint32_t timeout_ms; + __uint32_t result; +}; + +#define nvme_admin_cmd nvme_passthru_cmd + +/* + * Linux nvme ioctls, commented out ones are not supported + */ +#define NVME_IOCTL_ID _IO('N', 0x40) +#define NVME_IOCTL_ADMIN_CMD _IOWR('N', 0x41, struct nvme_admin_cmd) +/* #define NVME_IOCTL_SUBMIT_IO _IOW('N', 0x42, struct nvme_user_io) */ +#define NVME_IOCTL_IO_CMD _IOWR('N', 0x43, struct nvme_passthru_cmd) +#define NVME_IOCTL_RESET _IO('N', 0x44) +/* #define NVME_IOCTL_SUBSYS_RESET _IO('N', 0x45) */ +/* #define NVME_IOCTL_RESCAN _IO('N', 0x46) */ +/* #define NVME_IOCTL_ADMIN64_CMD _IOWR('N', 0x47, struct nvme_passthru_cmd64) */ +/* #define NVME_IOCTL_IO64_CMD _IOWR('N', 0x48, struct nvme_passthru_cmd64) */ +/* #define NVME_IOCTL_IO64_CMD_VEC _IOWR('N', 0x49, struct nvme_passthru_cmd64) */ + +/* io_uring async commands: */ +/* #define NVME_URING_CMD_IO _IOWR('N', 0x80, struct nvme_uring_cmd) */ +/* #define NVME_URING_CMD_IO_VEC _IOWR('N', 0x81, struct nvme_uring_cmd) */ +/* #define NVME_URING_CMD_ADMIN _IOWR('N', 0x82, struct 
nvme_uring_cmd) */ +/* #define NVME_URING_CMD_ADMIN_VEC _IOWR('N', 0x83, struct nvme_uring_cmd) */ diff --git a/sys/dev/nvme/nvme_ns.c b/sys/dev/nvme/nvme_ns.c index 4c65e2c49e64..3f29382fe42f 100644 --- a/sys/dev/nvme/nvme_ns.c +++ b/sys/dev/nvme/nvme_ns.c @@ -43,6 +43,7 @@ #include #include "nvme_private.h" +#include "nvme_linux.h" static void nvme_bio_child_inbed(struct bio *parent, int bio_error); static void nvme_bio_child_done(void *arg, @@ -93,6 +94,18 @@ nvme_ns_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag, case DIOCGSECTORSIZE: *(u_int *)arg = nvme_ns_get_sector_size(ns); break; + /* Linux Compatible (see nvme_linux.h) */ + case NVME_IOCTL_ID: + td->td_retval[0] = ns->id; + return (0); + + case NVME_IOCTL_ADMIN_CMD: + case NVME_IOCTL_IO_CMD: { + struct nvme_passthru_cmd *npc = (struct nvme_passthru_cmd *)arg; + + return (nvme_ctrlr_linux_passthru_cmd(ctrlr, npc, ns->id, true, + cmd == NVME_IOCTL_ADMIN_CMD)); + } default: return (ENOTTY); } @@ -610,7 +623,6 @@ nvme_ns_construct(struct nvme_namespace *ns, uint32_t id, return (ENXIO); ns->cdev->si_drv2 = make_dev_alias(ns->cdev, "%sns%d", device_get_nameunit(ctrlr->dev), ns->id); - ns->cdev->si_flags |= SI_UNMAPPED; return (0); From ad9cc86bf60cee2b35e804b348840a096f66561d Mon Sep 17 00:00:00 2001 From: Chuck Tuffli Date: Fri, 14 Jun 2024 16:40:20 -0600 Subject: [PATCH 90/91] linux: Translate Linux NVME ioctls to the lower layers. The lower layers implement a ABI compatible Linux ioctl for a few of the Linux IOCTLs. Translate them and pass them down. Since they are ABI compatible, just use the nvme ioctl name. 
Co-Authored-by: Warner Losh Reviewed by: chuck Differential Revision: https://reviews.freebsd.org/D45416 --- sys/compat/linux/linux_ioctl.c | 35 ++++++++++++++++++++++++++++++++++ sys/compat/linux/linux_ioctl.h | 14 ++++++++++++++ 2 files changed, 49 insertions(+) diff --git a/sys/compat/linux/linux_ioctl.c b/sys/compat/linux/linux_ioctl.c index 41c43f1ef8e6..aa2c9ce7f273 100644 --- a/sys/compat/linux/linux_ioctl.c +++ b/sys/compat/linux/linux_ioctl.c @@ -83,6 +83,8 @@ #include +#include + #define DEFINE_LINUX_IOCTL_SET(shortname, SHORTNAME) \ static linux_ioctl_function_t linux_ioctl_ ## shortname; \ static struct linux_ioctl_handler shortname ## _handler = { \ @@ -108,6 +110,9 @@ DEFINE_LINUX_IOCTL_SET(v4l2, VIDEO2); DEFINE_LINUX_IOCTL_SET(fbsd_usb, FBSD_LUSB); DEFINE_LINUX_IOCTL_SET(evdev, EVDEV); DEFINE_LINUX_IOCTL_SET(kcov, KCOV); +#ifndef COMPAT_LINUX32 +DEFINE_LINUX_IOCTL_SET(nvme, NVME); +#endif #undef DEFINE_LINUX_IOCTL_SET @@ -3531,6 +3536,36 @@ linux_ioctl_kcov(struct thread *td, struct linux_ioctl_args *args) return (error); } +#ifndef COMPAT_LINUX32 +static int +linux_ioctl_nvme(struct thread *td, struct linux_ioctl_args *args) +{ + + /* + * The NVMe drivers for namespace and controller implement these + * commands using their native format. All the others are not + * implemented yet. 
+ */ + switch (args->cmd & 0xffff) { + case LINUX_NVME_IOCTL_ID: + args->cmd = NVME_IOCTL_ID; + break; + case LINUX_NVME_IOCTL_RESET: + args->cmd = NVME_IOCTL_RESET; + break; + case LINUX_NVME_IOCTL_ADMIN_CMD: + args->cmd = NVME_IOCTL_ADMIN_CMD; + break; + case LINUX_NVME_IOCTL_IO_CMD: + args->cmd = NVME_IOCTL_IO_CMD; + break; + default: + return (ENODEV); + } + return (sys_ioctl(td, (struct ioctl_args *)args)); +} +#endif + /* * main ioctl syscall function */ diff --git a/sys/compat/linux/linux_ioctl.h b/sys/compat/linux/linux_ioctl.h index 8a56e35d10c6..4ef6d4f40830 100644 --- a/sys/compat/linux/linux_ioctl.h +++ b/sys/compat/linux/linux_ioctl.h @@ -781,6 +781,20 @@ #define LINUX_KCOV_DISABLE 0x6365 #define LINUX_KCOV_REMOTE_ENABLE 0x6366 +/* + * NVMe IOCTLs defined by Linux + */ +#define LINUX_NVME_IOCTL_ID 0x4e40 +#define LINUX_NVME_IOCTL_ADMIN_CMD 0x4e41 +#define LINUX_NVME_IOCTL_SUBMIT_IO 0x4e42 +#define LINUX_NVME_IOCTL_IO_CMD 0x4e43 +#define LINUX_NVME_IOCTL_RESET 0x4e44 +#define LINUX_NVME_IOCTL_SUBSYS_RESET 0x4e45 +#define LINUX_NVME_IOCTL_RESCAN 0x4e46 + +#define LINUX_IOCTL_NVME_MIN LINUX_NVME_IOCTL_ID +#define LINUX_IOCTL_NVME_MAX LINUX_NVME_IOCTL_RESCAN + /* * Pluggable ioctl handlers */ From e52eeb8fe3b2b79332aaf2546a65cfd0fd33cda5 Mon Sep 17 00:00:00 2001 From: Ed Maste Date: Mon, 17 Jun 2024 13:03:44 -0400 Subject: [PATCH 91/91] Revert "tzsetup: ask local/UTC question only on x86" I am unsure if AArch64 Windows systems keep UTC or local time in the RTC by default, so keep tzsetup consistent across architectures for now. This reverts commit aa6fb1d277be47c51abc309ac9305def0fce7f9d. 
Reported by: Mark Millard (cherry picked from commit c6030b380469f928c8cae87ed53bcb234cb3486f) --- usr.sbin/tzsetup/tzsetup.8 | 1 - usr.sbin/tzsetup/tzsetup.c | 25 ++++++++++--------------- 2 files changed, 10 insertions(+), 16 deletions(-) diff --git a/usr.sbin/tzsetup/tzsetup.8 b/usr.sbin/tzsetup/tzsetup.8 index 4e70875ec74b..3fd463c31ee5 100644 --- a/usr.sbin/tzsetup/tzsetup.8 +++ b/usr.sbin/tzsetup/tzsetup.8 @@ -63,7 +63,6 @@ Skip the initial question about adjusting the clock if not set to UTC. will neither create nor delete .Pa /etc/wall_cmos_clock . On a newly installed system, the hardware clock will keep UTC. -This option is enabled automatically on non-x86 hardware. .El .Pp It is possible to short-circuit the menu system by specifying the diff --git a/usr.sbin/tzsetup/tzsetup.c b/usr.sbin/tzsetup/tzsetup.c index 6629dd81f250..617de4efb765 100644 --- a/usr.sbin/tzsetup/tzsetup.c +++ b/usr.sbin/tzsetup/tzsetup.c @@ -826,28 +826,23 @@ main(int argc, char **argv) char prompt[128]; int fd; #endif - int c, rv; - bool skiputc; - char *dztpath; -#if defined(__i386__) || defined(__amd64__) + int c, rv, skiputc; char vm_guest[16] = ""; size_t len = sizeof(vm_guest); + char *dztpath; - skiputc = false; - - /* Default skiputc to true for VM guests */ - if (sysctlbyname("kern.vm_guest", vm_guest, &len, NULL, 0) == 0 && - strcmp(vm_guest, "none") != 0) - skiputc = true; -#else - skiputc = true; -#endif dztpath = NULL; + skiputc = 0; #ifdef HAVE_BSDDIALOG setlocale(LC_ALL, ""); #endif + /* Default skiputc to 1 for VM guests */ + if (sysctlbyname("kern.vm_guest", vm_guest, &len, NULL, 0) == 0 && + strcmp(vm_guest, "none") != 0) + skiputc = 1; + while ((c = getopt(argc, argv, "C:d:nrs")) != -1) { switch (c) { case 'C': @@ -866,7 +861,7 @@ main(int argc, char **argv) #endif break; case 's': - skiputc = true; + skiputc = 1; break; default: usage(); @@ -956,7 +951,7 @@ main(int argc, char **argv) if (bsddialog_init() == BSDDIALOG_ERROR) errx(1, "Error bsddialog: %s\n", 
bsddialog_geterror()); - if (!skiputc) { + if (skiputc == 0) { snprintf(prompt, sizeof(prompt), "Is this machine's CMOS clock set to UTC? " "If it is set to local time,\n"