5042 stop using deprecated atomic functions
--- old/usr/src/uts/sun4/vm/vm_dep.c
+++ new/usr/src/uts/sun4/vm/vm_dep.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 23 * Use is subject to license terms.
24 24 */
25 25
26 26 /*
27 27 * UNIX machine dependent virtual memory support.
28 28 */
29 29
30 30 #include <sys/vm.h>
31 31 #include <sys/exec.h>
32 32
33 33 #include <sys/exechdr.h>
34 34 #include <vm/seg_kmem.h>
35 35 #include <sys/atomic.h>
36 36 #include <sys/archsystm.h>
37 37 #include <sys/machsystm.h>
38 38 #include <sys/kdi.h>
39 39 #include <sys/cpu_module.h>
40 40
41 41 #include <vm/hat_sfmmu.h>
42 42
43 43 #include <sys/memnode.h>
44 44
45 45 #include <sys/mem_config.h>
46 46 #include <sys/mem_cage.h>
47 47 #include <vm/vm_dep.h>
48 48 #include <vm/page.h>
49 49 #include <sys/platform_module.h>
50 50
51 51 /*
52 52 * These variables are set by module specific config routines.
53 53 * They are only set by modules which will use physical cache page coloring.
54 54 */
55 55 int do_pg_coloring = 0;
56 56
57 57 /*
58 58 * These variables can be conveniently patched at kernel load time to
59 59 * prevent do_pg_coloring from being enabled by
60 60 * module specific config routines.
61 61 */
62 62
63 63 int use_page_coloring = 1;
64 64
65 65 /*
66 66 * initialized by page_coloring_init()
67 67 */
68 68 extern uint_t page_colors;
69 69 extern uint_t page_colors_mask;
70 70 extern uint_t page_coloring_shift;
71 71 int cpu_page_colors;
72 72 uint_t vac_colors = 0;
73 73 uint_t vac_colors_mask = 0;
74 74
75 75 /* cpu specific coloring initialization */
76 76 extern void page_coloring_init_cpu();
77 77 #pragma weak page_coloring_init_cpu
78 78
79 79 /*
80 80 * get the ecache setsize for the current cpu.
81 81 */
82 82 #define CPUSETSIZE() (cpunodes[CPU->cpu_id].ecache_setsize)
83 83
84 84 plcnt_t plcnt; /* page list count */
85 85
86 86 /*
87 87 * This variable is set by the cpu module to contain the lowest
88 88 * address not affected by the SF_ERRATA_57 workaround. It should
89 89 * remain 0 if the workaround is not needed.
90 90 */
91 91 #if defined(SF_ERRATA_57)
92 92 caddr_t errata57_limit;
93 93 #endif
94 94
95 95 extern void page_relocate_hash(page_t *, page_t *);
96 96
97 97 /*
98 98 * these must be defined in platform specific areas
99 99 */
100 100 extern void map_addr_proc(caddr_t *, size_t, offset_t, int, caddr_t,
101 101 struct proc *, uint_t);
102 102 extern page_t *page_get_freelist(struct vnode *, u_offset_t, struct seg *,
103 103 caddr_t, size_t, uint_t, struct lgrp *);
104 104 /*
105 105 * Convert page frame number to an OBMEM page frame number
106 106 * (i.e. put in the type bits -- zero for this implementation)
107 107 */
108 108 pfn_t
109 109 impl_obmem_pfnum(pfn_t pf)
110 110 {
111 111 return (pf);
112 112 }
113 113
114 114 /*
115 115 * Use physmax to determine the highest physical page of DRAM memory
116 116 * It is assumed that any physical addresses above physmax are in IO space.
117 117 * We don't bother checking the low end because we assume that memory space
118 118 * begins at physical page frame 0.
119 119 *
120 120 * Return 1 if the page frame is onboard DRAM memory, else 0.
121 121 * Returns 0 for nvram so it won't be cached.
122 122 */
123 123 int
124 124 pf_is_memory(pfn_t pf)
125 125 {
126 126 /* We must be IO space */
127 127 if (pf > physmax)
128 128 return (0);
129 129
130 130 /* We must be memory space */
131 131 return (1);
132 132 }
133 133
134 134 /*
135 135 * Handle a pagefault.
136 136 */
137 137 faultcode_t
138 138 pagefault(caddr_t addr, enum fault_type type, enum seg_rw rw, int iskernel)
139 139 {
140 140 struct as *as;
141 141 struct proc *p;
142 142 faultcode_t res;
143 143 caddr_t base;
144 144 size_t len;
145 145 int err;
146 146
147 147 if (INVALID_VADDR(addr))
148 148 return (FC_NOMAP);
149 149
150 150 if (iskernel) {
151 151 as = &kas;
152 152 } else {
153 153 p = curproc;
154 154 as = p->p_as;
155 155 #if defined(SF_ERRATA_57)
156 156 /*
157 157 * Prevent infinite loops due to a segment driver
158 158 * setting the execute permissions and the sfmmu hat
159 159 * silently ignoring them.
160 160 */
161 161 if (rw == S_EXEC && AS_TYPE_64BIT(as) &&
162 162 addr < errata57_limit) {
163 163 res = FC_NOMAP;
164 164 goto out;
165 165 }
166 166 #endif
167 167 }
168 168
169 169 /*
170 170 * Dispatch pagefault.
171 171 */
172 172 res = as_fault(as->a_hat, as, addr, 1, type, rw);
173 173
174 174 /*
175 175 * If this isn't a potential unmapped hole in the user's
176 176 * UNIX data or stack segments, just return status info.
177 177 */
178 178 if (!(res == FC_NOMAP && iskernel == 0))
179 179 goto out;
180 180
181 181 /*
182 182 * Check to see if we happened to fault on a currently unmapped
183 183 * part of the UNIX data or stack segments. If so, create a zfod
184 184 * mapping there and then try calling the fault routine again.
185 185 */
186 186 base = p->p_brkbase;
187 187 len = p->p_brksize;
188 188
189 189 if (addr < base || addr >= base + len) { /* data seg? */
190 190 base = (caddr_t)(p->p_usrstack - p->p_stksize);
191 191 len = p->p_stksize;
192 192 if (addr < base || addr >= p->p_usrstack) { /* stack seg? */
193 193 /* not in either UNIX data or stack segments */
194 194 res = FC_NOMAP;
195 195 goto out;
196 196 }
197 197 }
198 198
199 199 /* the rest of this function implements a 3.X 4.X 5.X compatibility */
200 200 /* This code is probably not needed anymore */
201 201
202 202 /* expand the gap to the page boundaries on each side */
203 203 len = (((uintptr_t)base + len + PAGEOFFSET) & PAGEMASK) -
204 204 ((uintptr_t)base & PAGEMASK);
205 205 base = (caddr_t)((uintptr_t)base & PAGEMASK);
206 206
207 207 as_rangelock(as);
208 208 as_purge(as);
209 209 if (as_gap(as, PAGESIZE, &base, &len, AH_CONTAIN, addr) == 0) {
210 210 err = as_map(as, base, len, segvn_create, zfod_argsp);
211 211 as_rangeunlock(as);
212 212 if (err) {
213 213 res = FC_MAKE_ERR(err);
214 214 goto out;
215 215 }
216 216 } else {
217 217 /*
218 218 * This page is already mapped by another thread after we
218 218 * returned from as_fault() above. We just fall through to
220 220 * as_fault() below.
221 221 */
222 222 as_rangeunlock(as);
223 223 }
224 224
225 225 res = as_fault(as->a_hat, as, addr, 1, F_INVAL, rw);
226 226
227 227 out:
228 228
229 229 return (res);
230 230 }
231 231
232 232 /*
233 233 * This is the routine which defines the address limit implied
234 234 * by the flag '_MAP_LOW32'. USERLIMIT32 matches the highest
235 235 * mappable address in a 32-bit process on this platform (though
236 236 * perhaps we should make it UINT32_MAX here?)
237 237 */
238 238 void
239 239 map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
240 240 {
241 241 struct proc *p = curproc;
242 242 caddr_t userlimit = flags & _MAP_LOW32 ?
243 243 (caddr_t)USERLIMIT32 : p->p_as->a_userlimit;
244 244 map_addr_proc(addrp, len, off, vacalign, userlimit, p, flags);
245 245 }
246 246
247 247 /*
248 248 * Some V9 CPUs have holes in the middle of the 64-bit virtual address range.
249 249 */
250 250 caddr_t hole_start, hole_end;
251 251
252 252 /*
253 253 * kpm mapping window
254 254 */
255 255 caddr_t kpm_vbase;
256 256 size_t kpm_size;
257 257 uchar_t kpm_size_shift;
258 258
259 259 int valid_va_range_aligned_wraparound;
260 260 /*
261 261 * Determine whether [*basep, *basep + *lenp) contains a mappable range of
262 262 * addresses at least "minlen" long, where the base of the range is at "off"
263 263 * phase from an "align" boundary and there is space for a "redzone"-sized
264 264 * redzone on either side of the range. On success, 1 is returned and *basep
265 265 * and *lenp are adjusted to describe the acceptable range (including
266 266 * the redzone). On failure, 0 is returned.
267 267 */
268 268 int
269 269 valid_va_range_aligned(caddr_t *basep, size_t *lenp, size_t minlen, int dir,
270 270 size_t align, size_t redzone, size_t off)
271 271 {
272 272 caddr_t hi, lo;
273 273 size_t tot_len;
274 274
275 275 ASSERT(align == 0 ? off == 0 : off < align);
276 276 ASSERT(ISP2(align));
277 277 ASSERT(align == 0 || align >= PAGESIZE);
278 278
279 279 lo = *basep;
280 280 hi = lo + *lenp;
281 281 tot_len = minlen + 2 * redzone; /* need at least this much space */
282 282
283 283 /* If hi rolled over the top try cutting back. */
284 284 if (hi < lo) {
285 285 *lenp = 0UL - (uintptr_t)lo - 1UL;
286 286 /* Trying to see if this really happens, and then if so, why */
287 287 valid_va_range_aligned_wraparound++;
288 288 hi = lo + *lenp;
289 289 }
290 290 if (*lenp < tot_len) {
291 291 return (0);
292 292 }
293 293
294 294 /*
295 295 * Deal with a possible hole in the address range between
296 296 * hole_start and hole_end that should never be mapped by the MMU.
297 297 */
298 298
299 299 if (lo < hole_start) {
300 300 if (hi > hole_start)
301 301 if (hi < hole_end)
302 302 hi = hole_start;
303 303 else
304 304 /* lo < hole_start && hi >= hole_end */
305 305 if (dir == AH_LO) {
306 306 /*
307 307 * prefer lowest range
308 308 */
309 309 if (hole_start - lo >= tot_len)
310 310 hi = hole_start;
311 311 else if (hi - hole_end >= tot_len)
312 312 lo = hole_end;
313 313 else
314 314 return (0);
315 315 } else {
316 316 /*
317 317 * prefer highest range
318 318 */
319 319 if (hi - hole_end >= tot_len)
320 320 lo = hole_end;
321 321 else if (hole_start - lo >= tot_len)
322 322 hi = hole_start;
323 323 else
324 324 return (0);
325 325 }
326 326 } else {
327 327 /* lo >= hole_start */
328 328 if (hi < hole_end)
329 329 return (0);
330 330 if (lo < hole_end)
331 331 lo = hole_end;
332 332 }
333 333
334 334 /* Check if remaining length is too small */
335 335 if (hi - lo < tot_len) {
336 336 return (0);
337 337 }
338 338 if (align > 1) {
339 339 caddr_t tlo = lo + redzone;
340 340 caddr_t thi = hi - redzone;
341 341 tlo = (caddr_t)P2PHASEUP((uintptr_t)tlo, align, off);
342 342 if (tlo < lo + redzone) {
343 343 return (0);
344 344 }
345 345 if (thi < tlo || thi - tlo < minlen) {
346 346 return (0);
347 347 }
348 348 }
349 349 *basep = lo;
350 350 *lenp = hi - lo;
351 351 return (1);
352 352 }
353 353
354 354 /*
355 355 * Determine whether [*basep, *basep + *lenp) contains a mappable range of
356 356 * addresses at least "minlen" long. On success, 1 is returned and *basep
357 357 * and *lenp are adjusted to describe the acceptable range. On failure, 0
358 358 * is returned.
359 359 */
360 360 int
361 361 valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir)
362 362 {
363 363 return (valid_va_range_aligned(basep, lenp, minlen, dir, 0, 0, 0));
364 364 }
365 365
366 366 /*
367 367 * Determine whether [addr, addr+len] with protections `prot' are valid
368 368 * for a user address space.
369 369 */
370 370 /*ARGSUSED*/
371 371 int
372 372 valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as,
373 373 caddr_t userlimit)
374 374 {
375 375 caddr_t eaddr = addr + len;
376 376
377 377 if (eaddr <= addr || addr >= userlimit || eaddr > userlimit)
378 378 return (RANGE_BADADDR);
379 379
380 380 /*
381 381 * Determine if the address range falls within an illegal
382 382 * range of the MMU.
383 383 */
384 384 if (eaddr > hole_start && addr < hole_end)
385 385 return (RANGE_BADADDR);
386 386
387 387 #if defined(SF_ERRATA_57)
388 388 /*
389 389 * Make sure USERLIMIT isn't raised too high
390 390 */
391 391 ASSERT64(addr <= (caddr_t)0xffffffff80000000ul ||
392 392 errata57_limit == 0);
393 393
394 394 if (AS_TYPE_64BIT(as) &&
395 395 (addr < errata57_limit) &&
396 396 (prot & PROT_EXEC))
397 397 return (RANGE_BADPROT);
398 398 #endif /* SF_ERRATA_57 */
399 399 return (RANGE_OKAY);
400 400 }
401 401
402 402 /*
403 403 * Routine used to check whether an a.out can be executed
404 404 * by the current machine/architecture.
405 405 */
406 406 int
407 407 chkaout(struct exdata *exp)
408 408 {
409 409 if (exp->ux_mach == M_SPARC)
410 410 return (0);
411 411 else
412 412 return (ENOEXEC);
413 413 }
414 414
415 415 /*
416 416 * The following functions return information about an a.out
417 417 * which is used when a program is executed.
418 418 */
419 419
420 420 /*
421 421 * Return the load memory address for the data segment.
422 422 */
423 423 caddr_t
424 424 getdmem(struct exec *exp)
425 425 {
426 426 /*
427 427 * XXX - Sparc Reference Hack approaching
428 428 * Remember that we are loading
429 429 * 8k executables into a 4k machine
430 430 * DATA_ALIGN == 2 * PAGESIZE
431 431 */
432 432 if (exp->a_text)
433 433 return ((caddr_t)(roundup(USRTEXT + exp->a_text, DATA_ALIGN)));
434 434 else
435 435 return ((caddr_t)USRTEXT);
436 436 }
437 437
438 438 /*
439 439 * Return the starting disk address for the data segment.
440 440 */
441 441 ulong_t
442 442 getdfile(struct exec *exp)
443 443 {
444 444 if (exp->a_magic == ZMAGIC)
445 445 return (exp->a_text);
446 446 else
447 447 return (sizeof (struct exec) + exp->a_text);
448 448 }
449 449
450 450 /*
451 451 * Return the load memory address for the text segment.
452 452 */
453 453
454 454 /*ARGSUSED*/
455 455 caddr_t
456 456 gettmem(struct exec *exp)
457 457 {
458 458 return ((caddr_t)USRTEXT);
459 459 }
460 460
461 461 /*
462 462 * Return the file byte offset for the text segment.
463 463 */
464 464 uint_t
465 465 gettfile(struct exec *exp)
466 466 {
467 467 if (exp->a_magic == ZMAGIC)
468 468 return (0);
469 469 else
470 470 return (sizeof (struct exec));
471 471 }
472 472
473 473 void
474 474 getexinfo(
475 475 struct exdata *edp_in,
476 476 struct exdata *edp_out,
477 477 int *pagetext,
478 478 int *pagedata)
479 479 {
480 480 *edp_out = *edp_in; /* structure copy */
481 481
482 482 if ((edp_in->ux_mag == ZMAGIC) &&
483 483 ((edp_in->vp->v_flag & VNOMAP) == 0)) {
484 484 *pagetext = 1;
485 485 *pagedata = 1;
486 486 } else {
487 487 *pagetext = 0;
488 488 *pagedata = 0;
489 489 }
490 490 }
491 491
492 492 /*
493 493 * Return a nonzero value if the address may cause a VAC alias with KPM mappings.
494 494 * KPM selects an address equal to the offset modulo shm_alignment and
495 495 * assumes it can't be in VAC conflict with any mapping larger than PAGESIZE.
496 496 */
497 497 int
498 498 map_addr_vacalign_check(caddr_t addr, u_offset_t off)
499 499 {
500 500 if (vac) {
501 501 return (((uintptr_t)addr ^ off) & shm_alignment - 1);
502 502 } else {
503 503 return (0);
504 504 }
505 505 }
506 506
507 507 /*
508 508 * Sanity control. Don't use large pages regardless of user
509 509 * settings if there's less than privm_lpg_min_physmem or shm_lpg_min_physmem
510 510 * memory installed. The units for these variables are 8K pages.
511 511 */
512 512 pgcnt_t shm_lpg_min_physmem = 131072; /* 1GB */
513 513 pgcnt_t privm_lpg_min_physmem = 131072; /* 1GB */
514 514
515 515 static size_t
516 516 map_pgszheap(struct proc *p, caddr_t addr, size_t len)
517 517 {
518 518 size_t pgsz = MMU_PAGESIZE;
519 519 int szc;
520 520
521 521 /*
522 522 * If len is zero, retrieve from proc and don't demote the page size.
523 523 * Use at least the default pagesize.
524 524 */
525 525 if (len == 0) {
526 526 len = p->p_brkbase + p->p_brksize - p->p_bssbase;
527 527 }
528 528 len = MAX(len, default_uheap_lpsize);
529 529
530 530 for (szc = mmu_page_sizes - 1; szc >= 0; szc--) {
531 531 pgsz = hw_page_array[szc].hp_size;
532 532 if ((disable_auto_data_large_pages & (1 << szc)) ||
533 533 pgsz > max_uheap_lpsize)
534 534 continue;
535 535 if (len >= pgsz) {
536 536 break;
537 537 }
538 538 }
539 539
540 540 /*
541 541 * If addr == 0 we were called by memcntl() when the
542 542 * size code is 0. Don't set pgsz less than current size.
543 543 */
544 544 if (addr == 0 && (pgsz < hw_page_array[p->p_brkpageszc].hp_size)) {
545 545 pgsz = hw_page_array[p->p_brkpageszc].hp_size;
546 546 }
547 547
548 548 return (pgsz);
549 549 }
550 550
551 551 static size_t
552 552 map_pgszstk(struct proc *p, caddr_t addr, size_t len)
553 553 {
554 554 size_t pgsz = MMU_PAGESIZE;
555 555 int szc;
556 556
557 557 /*
558 558 * If len is zero, retrieve from proc and don't demote the page size.
559 559 * Use at least the default pagesize.
560 560 */
561 561 if (len == 0) {
562 562 len = p->p_stksize;
563 563 }
564 564 len = MAX(len, default_ustack_lpsize);
565 565
566 566 for (szc = mmu_page_sizes - 1; szc >= 0; szc--) {
567 567 pgsz = hw_page_array[szc].hp_size;
568 568 if ((disable_auto_data_large_pages & (1 << szc)) ||
569 569 pgsz > max_ustack_lpsize)
570 570 continue;
571 571 if (len >= pgsz) {
572 572 break;
573 573 }
574 574 }
575 575
576 576 /*
577 577 * If addr == 0 we were called by memcntl() or exec_args() when the
578 578 * size code is 0. Don't set pgsz less than current size.
579 579 */
580 580 if (addr == 0 && (pgsz < hw_page_array[p->p_stkpageszc].hp_size)) {
581 581 pgsz = hw_page_array[p->p_stkpageszc].hp_size;
582 582 }
583 583
584 584 return (pgsz);
585 585 }
586 586
587 587 static size_t
588 588 map_pgszism(caddr_t addr, size_t len)
589 589 {
590 590 uint_t szc;
591 591 size_t pgsz;
592 592
593 593 for (szc = mmu_page_sizes - 1; szc >= TTE4M; szc--) {
594 594 if (disable_ism_large_pages & (1 << szc))
595 595 continue;
596 596
597 597 pgsz = hw_page_array[szc].hp_size;
598 598 if ((len >= pgsz) && IS_P2ALIGNED(addr, pgsz))
599 599 return (pgsz);
600 600 }
601 601
602 602 return (DEFAULT_ISM_PAGESIZE);
603 603 }
604 604
605 605 /*
606 606 * Suggest a page size to be used to map a segment of type maptype and length
607 607 * len. Returns a page size (not a size code).
608 608 */
609 609 /* ARGSUSED */
610 610 size_t
611 611 map_pgsz(int maptype, struct proc *p, caddr_t addr, size_t len, int memcntl)
612 612 {
613 613 size_t pgsz = MMU_PAGESIZE;
614 614
615 615 ASSERT(maptype != MAPPGSZ_VA);
616 616
617 617 if (maptype != MAPPGSZ_ISM && physmem < privm_lpg_min_physmem) {
618 618 return (MMU_PAGESIZE);
619 619 }
620 620
621 621 switch (maptype) {
622 622 case MAPPGSZ_ISM:
623 623 pgsz = map_pgszism(addr, len);
624 624 break;
625 625
626 626 case MAPPGSZ_STK:
627 627 if (max_ustack_lpsize > MMU_PAGESIZE) {
628 628 pgsz = map_pgszstk(p, addr, len);
629 629 }
630 630 break;
631 631
632 632 case MAPPGSZ_HEAP:
633 633 if (max_uheap_lpsize > MMU_PAGESIZE) {
634 634 pgsz = map_pgszheap(p, addr, len);
635 635 }
636 636 break;
637 637 }
638 638 return (pgsz);
639 639 }
640 640
641 641
642 642 /* assumes TTE8K...TTE4M == szc */
643 643
644 644 static uint_t
645 645 map_szcvec(caddr_t addr, size_t size, uintptr_t off, int disable_lpgs,
646 646 size_t max_lpsize, size_t min_physmem)
647 647 {
648 648 caddr_t eaddr = addr + size;
649 649 uint_t szcvec = 0;
650 650 caddr_t raddr;
651 651 caddr_t readdr;
652 652 size_t pgsz;
653 653 int i;
654 654
655 655 if (physmem < min_physmem || max_lpsize <= MMU_PAGESIZE) {
656 656 return (0);
657 657 }
658 658 for (i = mmu_page_sizes - 1; i > 0; i--) {
659 659 if (disable_lpgs & (1 << i)) {
660 660 continue;
661 661 }
662 662 pgsz = page_get_pagesize(i);
663 663 if (pgsz > max_lpsize) {
664 664 continue;
665 665 }
666 666 raddr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
667 667 readdr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
668 668 if (raddr < addr || raddr >= readdr) {
669 669 continue;
670 670 }
671 671 if (P2PHASE((uintptr_t)addr ^ off, pgsz)) {
672 672 continue;
673 673 }
674 674 szcvec |= (1 << i);
675 675 /*
676 676 * Also OR in the remaining enabled page sizes.
677 677 */
678 678 szcvec |= P2PHASE(~disable_lpgs, (1 << i));
679 679 szcvec &= ~1; /* no need to return 8K pagesize */
680 680 break;
681 681 }
682 682 return (szcvec);
683 683 }
684 684
685 685 /*
686 686 * Return a bit vector of large page size codes that
687 687 * can be used to map [addr, addr + len) region.
688 688 */
689 689 /* ARGSUSED */
690 690 uint_t
691 691 map_pgszcvec(caddr_t addr, size_t size, uintptr_t off, int flags, int type,
692 692 int memcntl)
693 693 {
694 694 if (flags & MAP_TEXT) {
695 695 return (map_szcvec(addr, size, off,
696 696 disable_auto_text_large_pages,
697 697 max_utext_lpsize, shm_lpg_min_physmem));
698 698
699 699 } else if (flags & MAP_INITDATA) {
700 700 return (map_szcvec(addr, size, off,
701 701 disable_auto_data_large_pages,
702 702 max_uidata_lpsize, privm_lpg_min_physmem));
703 703
704 704 } else if (type == MAPPGSZC_SHM) {
705 705 return (map_szcvec(addr, size, off,
706 706 disable_auto_data_large_pages,
707 707 max_shm_lpsize, shm_lpg_min_physmem));
708 708
709 709 } else if (type == MAPPGSZC_HEAP) {
710 710 return (map_szcvec(addr, size, off,
711 711 disable_auto_data_large_pages,
712 712 max_uheap_lpsize, privm_lpg_min_physmem));
713 713
714 714 } else if (type == MAPPGSZC_STACK) {
715 715 return (map_szcvec(addr, size, off,
716 716 disable_auto_data_large_pages,
717 717 max_ustack_lpsize, privm_lpg_min_physmem));
718 718
719 719 } else {
720 720 return (map_szcvec(addr, size, off,
721 721 disable_auto_data_large_pages,
722 722 max_privmap_lpsize, privm_lpg_min_physmem));
723 723 }
724 724 }
725 725
726 726 /*
727 727 * Anchored in the table below are counters used to keep track
728 728 * of free contiguous physical memory. Each element of the table contains
729 729 * the array of counters, the size of the array (allocated during
730 730 * startup based on physmax), and a shift value used to convert a pagenum
731 731 * into a counter array index or vice versa. The table has page size
732 732 * for rows and region size for columns:
733 733 *
734 734 * page_counters[page_size][region_size]
735 735 *
736 736 * page_size: TTE size code of pages on page_size freelist.
737 737 *
738 738 * region_size: TTE size code of a candidate larger page made up
739 739 * of contiguous free page_size pages.
740 740 *
741 741 * As you go across a page_size row increasing region_size each
742 742 * element keeps track of how many (region_size - 1) size groups
743 743 * made up of page_size free pages can be coalesced into a
744 744 * region_size page. Yuck! Let's try an example:
745 745 *
746 746 * page_counters[1][3] is the table element used for identifying
747 747 * candidate 4M pages from contiguous pages off the 64K free list.
748 748 * Each index in the page_counters[1][3].array spans 4M. It's the
749 749 * number of free 512K size (region_size - 1) groups of contiguous
750 750 * 64K free pages. So when page_counters[1][3].counters[n] == 8
751 751 * we know we have a candidate 4M page made up of 512K size groups
752 752 * of 64K free pages.
753 753 */
754 754
755 755 /*
756 756 * Per page size free lists. 3rd (max_mem_nodes) and 4th (page coloring bins)
757 757 * dimensions are allocated dynamically.
758 758 */
759 759 page_t ***page_freelists[MMU_PAGE_SIZES][MAX_MEM_TYPES];
760 760
761 761 /*
762 762 * For now there is only a single size cache list.
763 763 * Allocated dynamically.
764 764 */
765 765 page_t ***page_cachelists[MAX_MEM_TYPES];
766 766
767 767 kmutex_t *fpc_mutex[NPC_MUTEX];
768 768 kmutex_t *cpc_mutex[NPC_MUTEX];
769 769
770 770 /*
771 771 * Calculate space needed for page freelists and counters
772 772 */
773 773 size_t
774 774 calc_free_pagelist_sz(void)
775 775 {
776 776 int szc;
777 777 size_t alloc_sz, cache_sz, free_sz;
778 778
779 779 /*
780 780 * one cachelist per color, node, and type
781 781 */
782 782 cache_sz = (page_get_pagecolors(0) * sizeof (page_t *)) +
783 783 sizeof (page_t **);
784 784 cache_sz *= max_mem_nodes * MAX_MEM_TYPES;
785 785
786 786 /*
787 787 * one freelist per size, color, node, and type
788 788 */
789 789 free_sz = sizeof (page_t **);
790 790 for (szc = 0; szc < mmu_page_sizes; szc++)
791 791 free_sz += sizeof (page_t *) * page_get_pagecolors(szc);
792 792 free_sz *= max_mem_nodes * MAX_MEM_TYPES;
793 793
794 794 alloc_sz = cache_sz + free_sz + page_ctrs_sz();
795 795 return (alloc_sz);
796 796 }
797 797
798 798 caddr_t
799 799 alloc_page_freelists(caddr_t alloc_base)
800 800 {
801 801 int mnode, mtype;
802 802 int szc, clrs;
803 803
804 804 /*
805 805 * We only support small pages in the cachelist.
806 806 */
807 807 for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
808 808 page_cachelists[mtype] = (page_t ***)alloc_base;
809 809 alloc_base += (max_mem_nodes * sizeof (page_t **));
810 810 for (mnode = 0; mnode < max_mem_nodes; mnode++) {
811 811 page_cachelists[mtype][mnode] = (page_t **)alloc_base;
812 812 alloc_base +=
813 813 (page_get_pagecolors(0) * sizeof (page_t *));
814 814 }
815 815 }
816 816
817 817 /*
818 818 * Allocate freelist bins for all
819 819 * supported page sizes.
820 820 */
821 821 for (szc = 0; szc < mmu_page_sizes; szc++) {
822 822 clrs = page_get_pagecolors(szc);
823 823 for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
824 824 page_freelists[szc][mtype] = (page_t ***)alloc_base;
825 825 alloc_base += (max_mem_nodes * sizeof (page_t **));
826 826 for (mnode = 0; mnode < max_mem_nodes; mnode++) {
827 827 page_freelists[szc][mtype][mnode] =
828 828 (page_t **)alloc_base;
829 829 alloc_base += (clrs * (sizeof (page_t *)));
830 830 }
831 831 }
832 832 }
833 833
834 834 alloc_base = page_ctrs_alloc(alloc_base);
835 835 return (alloc_base);
836 836 }
837 837
838 838 /*
839 839 * Allocate page_freelists locks for a memnode from the nucleus data
840 840 * area. This is the first time that mmu_page_sizes is used during
841 841 * bootup, so check mmu_page_sizes initialization.
842 842 */
843 843 int
844 844 ndata_alloc_page_mutexs(struct memlist *ndata)
845 845 {
846 846 size_t alloc_sz;
847 847 caddr_t alloc_base;
848 848 int i;
849 849 void page_coloring_init();
850 850
851 851 page_coloring_init();
852 852 if (&mmu_init_mmu_page_sizes) {
853 853 if (!mmu_init_mmu_page_sizes(0)) {
854 854 cmn_err(CE_PANIC, "mmu_page_sizes %d not initialized",
855 855 mmu_page_sizes);
856 856 }
857 857 }
858 858 ASSERT(mmu_page_sizes >= DEFAULT_MMU_PAGE_SIZES);
859 859
860 860 /* fpc_mutex and cpc_mutex */
861 861 alloc_sz = 2 * NPC_MUTEX * max_mem_nodes * sizeof (kmutex_t);
862 862
863 863 alloc_base = ndata_alloc(ndata, alloc_sz, ecache_alignsize);
864 864 if (alloc_base == NULL)
865 865 return (-1);
866 866
867 867 ASSERT(((uintptr_t)alloc_base & (ecache_alignsize - 1)) == 0);
868 868
869 869 for (i = 0; i < NPC_MUTEX; i++) {
870 870 fpc_mutex[i] = (kmutex_t *)alloc_base;
871 871 alloc_base += (sizeof (kmutex_t) * max_mem_nodes);
872 872 cpc_mutex[i] = (kmutex_t *)alloc_base;
873 873 alloc_base += (sizeof (kmutex_t) * max_mem_nodes);
874 874 }
875 875 return (0);
876 876 }
877 877
878 878 /*
879 879 * To select our starting bin, we stride through the bins with a stride
880 880 * of 337. Why 337? It's prime, it's largeish, and it performs well both
881 881 * in simulation and practice for different workloads on varying cache sizes.
882 882 */
883 883 uint32_t color_start_current = 0;
884 884 uint32_t color_start_stride = 337;
885 885 int color_start_random = 0;
886 886
887 887 /* ARGSUSED */
888 888 uint_t
889 889 get_color_start(struct as *as)
890 890 {
890 lines elided
891 891 uint32_t old, new;
892 892
893 893 if (consistent_coloring == 2 || color_start_random) {
894 894 return ((uint_t)(((gettick()) << (vac_shift - MMU_PAGESHIFT)) &
895 895 (hw_page_array[0].hp_colors - 1)));
896 896 }
897 897
898 898 do {
899 899 old = color_start_current;
900 900 new = old + (color_start_stride << (vac_shift - MMU_PAGESHIFT));
901 - } while (cas32(&color_start_current, old, new) != old);
901 + } while (atomic_cas_32(&color_start_current, old, new) != old);
902 902
903 903 return ((uint_t)(new));
904 904 }
905 905
906 906 /*
907 907 * Called once at startup from kphysm_init() -- before memialloc()
908 908 * is invoked to do the 1st page_free()/page_freelist_add().
909 909 *
910 910 * initializes page_colors and page_colors_mask based on ecache_setsize.
911 911 *
912 912 * Also initializes the counter locks.
913 913 */
914 914 void
915 915 page_coloring_init()
916 916 {
917 917 int a, i;
918 918 uint_t colors;
919 919
920 920 if (do_pg_coloring == 0) {
921 921 page_colors = 1;
922 922 for (i = 0; i < mmu_page_sizes; i++) {
923 923 colorequivszc[i] = 0;
924 924 hw_page_array[i].hp_colors = 1;
925 925 }
926 926 return;
927 927 }
928 928
929 929 /*
930 930 * Calculate page_colors from ecache_setsize. ecache_setsize contains
931 931 * the max ecache setsize of all cpus configured in the system or, for
932 932 * cheetah+ systems, the max possible ecache setsize for all possible
933 933 * cheetah+ cpus.
934 934 */
935 935 page_colors = ecache_setsize / MMU_PAGESIZE;
936 936 page_colors_mask = page_colors - 1;
937 937
938 938 vac_colors = vac_size / MMU_PAGESIZE;
939 939 vac_colors_mask = vac_colors -1;
940 940
941 941 page_coloring_shift = 0;
942 942 a = ecache_setsize;
943 943 while (a >>= 1) {
944 944 page_coloring_shift++;
945 945 }
946 946
947 947 /* initialize number of colors per page size */
948 948 for (i = 0; i < mmu_page_sizes; i++) {
949 949 hw_page_array[i].hp_colors = (page_colors_mask >>
950 950 (hw_page_array[i].hp_shift - hw_page_array[0].hp_shift))
951 951 + 1;
952 952 colorequivszc[i] = 0;
953 953 }
954 954
955 955 /*
956 956 * Initialize cpu_page_colors if ecache setsizes are homogeneous.
957 957 * cpu_page_colors is set to -1 during a DR operation or during startup
958 958 * if setsizes are heterogeneous.
959 959 *
960 960 * The value of cpu_page_colors determines if additional color bins
961 961 * need to be checked for a particular color in the page_get routines.
962 962 */
963 963 if (cpu_setsize > 0 && cpu_page_colors == 0 &&
964 964 cpu_setsize < ecache_setsize) {
965 965 cpu_page_colors = cpu_setsize / MMU_PAGESIZE;
966 966 a = lowbit(page_colors) - lowbit(cpu_page_colors);
967 967 ASSERT(a > 0);
968 968 ASSERT(a < 16);
969 969
970 970 for (i = 0; i < mmu_page_sizes; i++) {
971 971 if ((colors = hw_page_array[i].hp_colors) <= 1) {
972 972 continue;
973 973 }
974 974 while ((colors >> a) == 0)
975 975 a--;
976 976 ASSERT(a >= 0);
977 977
978 978 /* higher 4 bits encode color equiv mask */
979 979 colorequivszc[i] = (a << 4);
980 980 }
981 981 }
982 982
983 983 /* do cpu specific color initialization */
984 984 if (&page_coloring_init_cpu) {
985 985 page_coloring_init_cpu();
986 986 }
987 987 }
988 988
989 989 int
990 990 bp_color(struct buf *bp)
991 991 {
992 992 int color = -1;
993 993
994 994 if (vac) {
995 995 if ((bp->b_flags & B_PAGEIO) != 0) {
996 996 color = sfmmu_get_ppvcolor(bp->b_pages);
997 997 } else if (bp->b_un.b_addr != NULL) {
998 998 color = sfmmu_get_addrvcolor(bp->b_un.b_addr);
999 999 }
1000 1000 }
1001 1001 return (color < 0 ? 0 : ptob(color));
1002 1002 }
1003 1003
1004 1004 /*
1005 1005 * Function for flushing D-cache when performing module relocations
1006 1006 * to an alternate mapping. Stubbed out on all platforms except sun4u,
1007 1007 * at least for now.
1008 1008 */
1009 1009 void
1010 1010 dcache_flushall()
1011 1011 {
1012 1012 sfmmu_cache_flushall();
1013 1013 }
1014 1014
1015 1015 static int
1016 1016 kdi_range_overlap(uintptr_t va1, size_t sz1, uintptr_t va2, size_t sz2)
1017 1017 {
1018 1018 if (va1 < va2 && va1 + sz1 <= va2)
1019 1019 return (0);
1020 1020
1021 1021 if (va2 < va1 && va2 + sz2 <= va1)
1022 1022 return (0);
1023 1023
1024 1024 return (1);
1025 1025 }
1026 1026
1027 1027 /*
1028 1028 * Return the number of bytes, relative to the beginning of a given range, that
1029 1029 * are non-toxic (can be read from and written to with relative impunity).
1030 1030 */
1031 1031 size_t
1032 1032 kdi_range_is_nontoxic(uintptr_t va, size_t sz, int write)
1033 1033 {
1034 1034 /* OBP reads are harmless, but we don't want people writing there */
1035 1035 if (write && kdi_range_overlap(va, sz, OFW_START_ADDR, OFW_END_ADDR -
1036 1036 OFW_START_ADDR + 1))
1037 1037 return (va < OFW_START_ADDR ? OFW_START_ADDR - va : 0);
1038 1038
1039 1039 if (kdi_range_overlap(va, sz, PIOMAPBASE, PIOMAPSIZE))
1040 1040 return (va < PIOMAPBASE ? PIOMAPBASE - va : 0);
1041 1041
1042 1042 return (sz); /* no overlap */
1043 1043 }
1044 1044
1045 1045 /*
1046 1046 * Minimum physmem required for enabling large pages for kernel heap
1047 1047 * Currently we do not enable lp for kmem on systems with less
1048 1048 * than 1GB of memory. This value can be changed via /etc/system
1049 1049 */
1050 1050 size_t segkmem_lpminphysmem = 0x40000000; /* 1GB */
1051 1051
1052 1052 /*
1053 1053 * This function chooses the large page size for the kernel heap.
1054 1054 */
1055 1055 size_t
1056 1056 get_segkmem_lpsize(size_t lpsize)
1057 1057 {
1058 1058 size_t memtotal = physmem * PAGESIZE;
1059 1059 size_t mmusz;
1060 1060 uint_t szc;
1061 1061
1062 1062 if (memtotal < segkmem_lpminphysmem)
1063 1063 return (PAGESIZE);
1064 1064
1065 1065 if (plat_lpkmem_is_supported != NULL &&
1066 1066 plat_lpkmem_is_supported() == 0)
1067 1067 return (PAGESIZE);
1068 1068
1069 1069 mmusz = mmu_get_kernel_lpsize(lpsize);
1070 1070 szc = page_szc(mmusz);
1071 1071
1072 1072 while (szc) {
1073 1073 if (!(disable_large_pages & (1 << szc)))
1074 1074 return (page_get_pagesize(szc));
1075 1075 szc--;
1076 1076 }
1077 1077 return (PAGESIZE);
1078 1078 }
167 lines elided
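The change in this file is the switch in get_color_start() from the deprecated cas32() to atomic_cas_32(), which is declared in <sys/atomic.h> (already included by this file). As a minimal sketch of the same compare-and-swap retry pattern, outside this webrev and with a hypothetical counter and function name:

	#include <sys/atomic.h>

	static volatile uint32_t counter;	/* hypothetical shared counter */

	static uint32_t
	counter_add(uint32_t delta)
	{
		uint32_t old, new;

		do {
			old = counter;
			new = old + delta;
			/*
			 * atomic_cas_32() stores new into counter only if
			 * counter still holds old, and returns the value it
			 * observed.  Any other return value means another
			 * thread updated counter first, so recompute and retry.
			 */
		} while (atomic_cas_32(&counter, old, new) != old);

		return (new);
	}

The retry loop is unchanged from the cas32() version shown in the diff above; only the name of the atomic primitive moves to the common interface in <sys/atomic.h>.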