Discussion: [PATCH 6/6] freepgt: hugetlb area is clean

Hugh Dickins
2005-03-23 17:35:06 UTC
Once we're strict about clearing away page tables, hugetlb_prefault can
assume there are no page tables left within its range. Since the other
arches continue if !pte_none here, let i386 do the same.
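For illustration, the resulting i386 prefault loop looks roughly like this
(a sketch reconstructed from the diff below, with unrelated lines elided):

	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
		pte_t *pte = huge_pte_alloc(mm, addr);
		if (!pte) {
			ret = -ENOMEM;
			goto out;
		}
		if (!pte_none(*pte))	/* already instantiated: skip it, */
			continue;	/* as the other arches already do */
		...
	}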

Signed-off-by: Hugh Dickins <***@veritas.com>
---

arch/i386/mm/hugetlbpage.c | 11 ++---------
arch/ppc64/mm/hugetlbpage.c | 37 -------------------------------------
2 files changed, 2 insertions(+), 46 deletions(-)

--- freepgt5/arch/i386/mm/hugetlbpage.c 2005-03-18 10:22:18.000000000 +0000
+++ freepgt6/arch/i386/mm/hugetlbpage.c 2005-03-23 15:41:23.000000000 +0000
@@ -246,15 +246,8 @@ int hugetlb_prefault(struct address_spac
goto out;
}

- if (!pte_none(*pte)) {
- pmd_t *pmd = (pmd_t *) pte;
-
- page = pmd_page(*pmd);
- pmd_clear(pmd);
- mm->nr_ptes--;
- dec_page_state(nr_page_table_pages);
- page_cache_release(page);
- }
+ if (!pte_none(*pte))
+ continue;

idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
--- freepgt5/arch/ppc64/mm/hugetlbpage.c 2005-03-21 19:07:01.000000000 +0000
+++ freepgt6/arch/ppc64/mm/hugetlbpage.c 2005-03-23 15:41:23.000000000 +0000
@@ -203,8 +203,6 @@ static int prepare_low_seg_for_htlb(stru
unsigned long start = seg << SID_SHIFT;
unsigned long end = (seg+1) << SID_SHIFT;
struct vm_area_struct *vma;
- unsigned long addr;
- struct mmu_gather *tlb;

BUG_ON(seg >= 16);

@@ -213,41 +211,6 @@ static int prepare_low_seg_for_htlb(stru
if (vma && (vma->vm_start < end))
return -EBUSY;

- /* Clean up any leftover PTE pages in the region */
- spin_lock(&mm->page_table_lock);
- tlb = tlb_gather_mmu(mm, 0);
- for (addr = start; addr < end; addr += PMD_SIZE) {
- pgd_t *pgd = pgd_offset(mm, addr);
- pmd_t *pmd;
- struct page *page;
- pte_t *pte;
- int i;
-
- if (pgd_none(*pgd))
- continue;
- pmd = pmd_offset(pgd, addr);
- if (!pmd || pmd_none(*pmd))
- continue;
- if (pmd_bad(*pmd)) {
- pmd_ERROR(*pmd);
- pmd_clear(pmd);
- continue;
- }
- pte = (pte_t *)pmd_page_kernel(*pmd);
- /* No VMAs, so there should be no PTEs, check just in case. */
- for (i = 0; i < PTRS_PER_PTE; i++) {
- BUG_ON(!pte_none(*pte));
- pte++;
- }
- page = pmd_page(*pmd);
- pmd_clear(pmd);
- mm->nr_ptes--;
- dec_page_state(nr_page_table_pages);
- pte_free_tlb(tlb, page);
- }
- tlb_finish_mmu(tlb, start, end);
- spin_unlock(&mm->page_table_lock);
-
return 0;
}

Hugh Dickins
2005-03-23 17:31:02 UTC
While dabbling here in mmap.c, clean up mysterious "mpnt"s to "vma"s.

Signed-off-by: Hugh Dickins <***@veritas.com>
---

mm/mmap.c | 35 +++++++++++++++++------------------
1 files changed, 17 insertions(+), 18 deletions(-)

--- freepgt4/mm/mmap.c 2005-03-21 19:06:48.000000000 +0000
+++ freepgt5/mm/mmap.c 2005-03-21 19:07:25.000000000 +0000
@@ -1602,14 +1602,13 @@ static void unmap_vma(struct mm_struct *
* Ok - we have the memory areas we should free on the 'free' list,
* so release them, and do the vma updates.
*/
-static void unmap_vma_list(struct mm_struct *mm,
- struct vm_area_struct *mpnt)
+static void unmap_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
{
do {
- struct vm_area_struct *next = mpnt->vm_next;
- unmap_vma(mm, mpnt);
- mpnt = next;
- } while (mpnt != NULL);
+ struct vm_area_struct *next = vma->vm_next;
+ unmap_vma(mm, vma);
+ vma = next;
+ } while (vma);
validate_mm(mm);
}

@@ -1720,7 +1719,7 @@ int split_vma(struct mm_struct * mm, str
int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
{
unsigned long end;
- struct vm_area_struct *mpnt, *prev, *last;
+ struct vm_area_struct *vma, *prev, *last;

if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start)
return -EINVAL;
@@ -1729,14 +1728,14 @@ int do_munmap(struct mm_struct *mm, unsi
return -EINVAL;

/* Find the first overlapping VMA */
- mpnt = find_vma_prev(mm, start, &prev);
- if (!mpnt)
+ vma = find_vma_prev(mm, start, &prev);
+ if (!vma)
return 0;
- /* we have start < mpnt->vm_end */
+ /* we have start < vma->vm_end */

/* if it doesn't overlap, we have nothing.. */
end = start + len;
- if (mpnt->vm_start >= end)
+ if (vma->vm_start >= end)
return 0;

/*
@@ -1746,11 +1745,11 @@ int do_munmap(struct mm_struct *mm, unsi
* unmapped vm_area_struct will remain in use: so lower split_vma
* places tmp vma above, and higher split_vma places tmp vma below.
*/
- if (start > mpnt->vm_start) {
- int error = split_vma(mm, mpnt, start, 0);
+ if (start > vma->vm_start) {
+ int error = split_vma(mm, vma, start, 0);
if (error)
return error;
- prev = mpnt;
+ prev = vma;
}

/* Does it split the last one? */
@@ -1760,16 +1759,16 @@ int do_munmap(struct mm_struct *mm, unsi
if (error)
return error;
}
- mpnt = prev? prev->vm_next: mm->mmap;
+ vma = prev? prev->vm_next: mm->mmap;

/*
* Remove the vma's, and unmap the actual pages
*/
- detach_vmas_to_be_unmapped(mm, mpnt, prev, end);
- unmap_region(mm, mpnt, prev, start, end);
+ detach_vmas_to_be_unmapped(mm, vma, prev, end);
+ unmap_region(mm, vma, prev, start, end);

/* Fix up all other VM information */
- unmap_vma_list(mm, mpnt);
+ unmap_vma_list(mm, vma);

return 0;
}

Hugh Dickins
2005-03-23 17:20:46 UTC
ia64 and ppc64 had hugetlb_free_pgtables functions which were no longer
being called, and it wasn't obvious what to do about them.

The ppc64 case turns out to be easy: the associated tables are noted
elsewhere and freed later, safe to either skip its hugetlb areas or go
through the motions of freeing nothing. Since ia64 does need a special
case, restore to ppc64 the special case of skipping them.

The ia64 hugetlb case has been broken since pgd_addr_end went in, though
it probably appeared to work okay if you just had one such area; in fact
it's been broken much longer if you consider a long munmap spanning from
another region into the hugetlb region.

In the ia64 hugetlb region, more virtual address bits are available than
in the other regions, yet the page tables are structured the same way:
the page at the bottom is larger. Here we need to scale down each addr
before passing it to the standard free_pgd_range. I was about to write a
hugely_scaled_down macro, but found htlbpage_to_page already exists for
just this purpose. Also fixed an off-by-one in ia64's is_hugepage_only_range.

Uninline free_pgd_range to make it available to ia64. Make sure the
vma-gathering loop in free_pgtables cannot join a hugepage_only_range
to any other (safe to join huges? probably, but don't bother).
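
For reference, the scaling leans on ia64's existing macro, which in the
headers of this era looks roughly like the following (quoted from memory,
so treat the exact form as an assumption):

	/* Keep the region bits (top three bits of the address), and divide
	 * the offset within the region by HPAGE_SIZE/PAGE_SIZE.
	 */
	#define htlbpage_to_page(x)	(((unsigned long) REGION_NUMBER(x) << 61) \
					 | (REGION_OFFSET(x) >> (HPAGE_SHIFT - PAGE_SHIFT)))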

Signed-off-by: Hugh Dickins <***@veritas.com>
---

arch/ia64/mm/hugetlbpage.c | 29 +++++++++++++++++++++++------
arch/ppc64/mm/hugetlbpage.c | 10 ----------
include/asm-ia64/page.h | 2 +-
include/asm-ia64/pgtable.h | 4 ++--
include/asm-ppc64/pgtable.h | 12 +++++++++---
include/linux/hugetlb.h | 6 ++++--
include/linux/mm.h | 4 +++-
mm/memory.c | 30 +++++++++++++++++++-----------
8 files changed, 61 insertions(+), 36 deletions(-)

--- freepgt2/arch/ia64/mm/hugetlbpage.c 2005-03-21 19:06:35.000000000 +0000
+++ freepgt3/arch/ia64/mm/hugetlbpage.c 2005-03-21 19:07:01.000000000 +0000
@@ -186,13 +186,30 @@ follow_huge_pmd(struct mm_struct *mm, un
return NULL;
}

-/*
- * Do nothing, until we've worked out what to do! To allow build, we
- * must remove reference to clear_page_range since it no longer exists.
- */
-void hugetlb_free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *prev,
- unsigned long start, unsigned long end)
+void hugetlb_free_pgd_range(struct mmu_gather **tlb,
+ unsigned long addr, unsigned long end,
+ unsigned long floor, unsigned long ceiling)
{
+ /*
+ * This is called only when is_hugepage_only_range(addr,),
+ * and it follows that is_hugepage_only_range(end,) also.
+ *
+ * The offset of these addresses from the base of the hugetlb
+ * region must be scaled down by HPAGE_SIZE/PAGE_SIZE so that
+ * the standard free_pgd_range will free the right page tables.
+ *
+ * If floor and ceiling are also in the hugetlb region, they
+ * must likewise be scaled down; but if outside, left unchanged.
+ */
+
+ addr = htlbpage_to_page(addr);
+ end = htlbpage_to_page(end);
+ if (is_hugepage_only_range(floor, HPAGE_SIZE))
+ floor = htlbpage_to_page(floor);
+ if (is_hugepage_only_range(ceiling, HPAGE_SIZE))
+ ceiling = htlbpage_to_page(ceiling);
+
+ free_pgd_range(tlb, addr, end, floor, ceiling);
}

void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsigned long end)
--- freepgt2/arch/ppc64/mm/hugetlbpage.c 2005-03-18 10:22:20.000000000 +0000
+++ freepgt3/arch/ppc64/mm/hugetlbpage.c 2005-03-21 19:07:01.000000000 +0000
@@ -430,16 +430,6 @@ void unmap_hugepage_range(struct vm_area
flush_tlb_pending();
}

-void hugetlb_free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *prev,
- unsigned long start, unsigned long end)
-{
- /* Because the huge pgtables are only 2 level, they can take
- * at most around 4M, much less than one hugepage which the
- * process is presumably entitled to use. So we don't bother
- * freeing up the pagetables on unmap, and wait until
- * destroy_context() to clean up the lot. */
-}
-
int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
{
struct mm_struct *mm = current->mm;
--- freepgt2/include/asm-ia64/page.h 2005-03-02 07:38:52.000000000 +0000
+++ freepgt3/include/asm-ia64/page.h 2005-03-21 20:38:06.000000000 +0000
@@ -139,7 +139,7 @@ typedef union ia64_va {
# define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
# define is_hugepage_only_range(addr, len) \
(REGION_NUMBER(addr) == REGION_HPAGE && \
- REGION_NUMBER((addr)+(len)) == REGION_HPAGE)
+ REGION_NUMBER((addr)+(len)-1) == REGION_HPAGE)
extern unsigned int hpage_shift;
#endif

--- freepgt2/include/asm-ia64/pgtable.h 2005-03-19 14:50:08.000000000 +0000
+++ freepgt3/include/asm-ia64/pgtable.h 2005-03-21 19:07:01.000000000 +0000
@@ -463,8 +463,8 @@ extern struct page *zero_page_memmap_ptr
#define HUGETLB_PGDIR_SIZE (__IA64_UL(1) << HUGETLB_PGDIR_SHIFT)
#define HUGETLB_PGDIR_MASK (~(HUGETLB_PGDIR_SIZE-1))
struct mmu_gather;
-extern void hugetlb_free_pgtables(struct mmu_gather *tlb,
- struct vm_area_struct * prev, unsigned long start, unsigned long end);
+void hugetlb_free_pgd_range(struct mmu_gather **tlb, unsigned long addr,
+ unsigned long end, unsigned long floor, unsigned long ceiling);
#endif

/*
--- freepgt2/include/asm-ppc64/pgtable.h 2005-03-18 10:22:41.000000000 +0000
+++ freepgt3/include/asm-ppc64/pgtable.h 2005-03-21 19:07:01.000000000 +0000
@@ -492,9 +492,15 @@ extern pgd_t ioremap_dir[1024];

extern void paging_init(void);

-struct mmu_gather;
-void hugetlb_free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *prev,
- unsigned long start, unsigned long end);
+/*
+ * Because the huge pgtables are only 2 level, they can take
+ * at most around 4M, much less than one hugepage which the
+ * process is presumably entitled to use. So we don't bother
+ * freeing up the pagetables on unmap, and wait until
+ * destroy_context() to clean up the lot.
+ */
+#define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) \
+ do { } while (0)

/*
* This gets called at the end of handling a page fault, when
--- freepgt2/include/linux/hugetlb.h 2004-08-14 06:39:00.000000000 +0100
+++ freepgt3/include/linux/hugetlb.h 2005-03-21 19:07:01.000000000 +0000
@@ -37,7 +37,8 @@ extern int sysctl_hugetlb_shm_group;

#ifndef ARCH_HAS_HUGEPAGE_ONLY_RANGE
#define is_hugepage_only_range(addr, len) 0
-#define hugetlb_free_pgtables(tlb, prev, start, end) do { } while (0)
+#define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) \
+ do { } while (0)
#endif

#ifndef ARCH_HAS_PREPARE_HUGEPAGE_RANGE
@@ -72,7 +73,8 @@ static inline unsigned long hugetlb_tota
#define prepare_hugepage_range(addr, len) (-EINVAL)
#define pmd_huge(x) 0
#define is_hugepage_only_range(addr, len) 0
-#define hugetlb_free_pgtables(tlb, prev, start, end) do { } while (0)
+#define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) \
+ do { } while (0)
#define alloc_huge_page() ({ NULL; })
#define free_huge_page(p) ({ (void)(p); BUG(); })

--- freepgt2/include/linux/mm.h 2005-03-21 19:06:48.000000000 +0000
+++ freepgt3/include/linux/mm.h 2005-03-21 19:07:01.000000000 +0000
@@ -586,7 +586,9 @@ unsigned long unmap_vmas(struct mmu_gath
struct vm_area_struct *start_vma, unsigned long start_addr,
unsigned long end_addr, unsigned long *nr_accounted,
struct zap_details *);
-void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
+void free_pgd_range(struct mmu_gather **tlb, unsigned long addr,
+ unsigned long end, unsigned long floor, unsigned long ceiling);
+void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *start_vma,
unsigned long floor, unsigned long ceiling);
int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
struct vm_area_struct *vma);
--- freepgt2/mm/memory.c 2005-03-23 15:27:53.000000000 +0000
+++ freepgt3/mm/memory.c 2005-03-23 15:28:15.000000000 +0000
@@ -190,7 +190,7 @@ static inline void free_pud_range(struct
*
* Must be called with pagetable lock held.
*/
-static inline void free_pgd_range(struct mmu_gather *tlb,
+void free_pgd_range(struct mmu_gather **tlb,
unsigned long addr, unsigned long end,
unsigned long floor, unsigned long ceiling)
{
@@ -241,16 +241,16 @@ static inline void free_pgd_range(struct
return;

start = addr;
- pgd = pgd_offset(tlb->mm, addr);
+ pgd = pgd_offset((*tlb)->mm, addr);
do {
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
continue;
- free_pud_range(tlb, pgd, addr, next, floor, ceiling);
+ free_pud_range(*tlb, pgd, addr, next, floor, ceiling);
} while (pgd++, addr = next, addr != end);

- if (!tlb_is_full_mm(tlb))
- flush_tlb_pgtables(tlb->mm, start, end);
+ if (!tlb_is_full_mm(*tlb))
+ flush_tlb_pgtables((*tlb)->mm, start, end);
}

void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
@@ -260,13 +260,21 @@ void free_pgtables(struct mmu_gather **t
struct vm_area_struct *next = vma->vm_next;
unsigned long addr = vma->vm_start;

- /* Optimization: gather nearby vmas into a single call down */
- while (next && next->vm_start <= vma->vm_end + PMD_SIZE) {
- vma = next;
- next = vma->vm_next;
- }
- free_pgd_range(*tlb, addr, vma->vm_end,
+ if (is_hugepage_only_range(addr, HPAGE_SIZE)) {
+ hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
floor, next? next->vm_start: ceiling);
+ } else {
+ /*
+ * Optimization: gather nearby vmas into one call down
+ */
+ while (next && next->vm_start <= vma->vm_end + PMD_SIZE
+ && !is_hugepage_only_range(next->vm_start, HPAGE_SIZE)){
+ vma = next;
+ next = vma->vm_next;
+ }
+ free_pgd_range(tlb, addr, vma->vm_end,
+ floor, next? next->vm_start: ceiling);
+ }
vma = next;
}
}

Hugh Dickins
2005-03-23 17:26:33 UTC
Recent woes with some arches needing their own pgd_addr_end macro; and
the 4-level clear_page_range regression since 2.6.10's clear_page_tables;
and its long-standing, well-known inefficiency in searching throughout
the higher-level page tables for those few entries to clear and free:
all can be blamed on ignoring the list of vmas when we free page tables.

Replace exit_mmap's clear_page_range of the total user address space by
free_pgtables operating on the mm's vma list; unmap_region uses it in the
same way, giving floor and ceiling beyond which it may not free tables.
This brings lmbench fork/exec/sh numbers back to 2.6.10 (unless preempt
is enabled, in which case latency fixes spoil unmap_vmas throughput).
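
As a minimal illustration of the floor/ceiling rule (hypothetical
addresses, PAE-style 2MB pte-page coverage; this mirrors the clamping
free_pgd_range does on entry, simplified, and is not the kernel code
itself):

	#include <stdio.h>

	#define PMD_SIZE	0x200000UL
	#define PMD_MASK	(~(PMD_SIZE - 1))

	/* Which pte pages may be freed for a vma [addr,end) bounded by
	 * floor (prev->vm_end) and ceiling (next->vm_start)?  A table
	 * still partly covering a neighbour must be left alone.
	 */
	static void may_free(unsigned long addr, unsigned long end,
			     unsigned long floor, unsigned long ceiling)
	{
		addr &= PMD_MASK;
		if (addr < floor)
			addr += PMD_SIZE;	/* first table shared below floor */
		ceiling &= PMD_MASK;
		if (end - 1 > ceiling - 1)
			end -= PMD_SIZE;	/* last table shared above ceiling */
		if (addr > end - 1)
			printf("nothing to free\n");
		else
			printf("may free tables covering %#lx..%#lx\n", addr, end);
	}

	int main(void)
	{
		/* pte page private to the vma: freed */
		may_free(0xa00000, 0xc00000, 0x900000, 0xd00000);
		/* prev vma ends inside the same pte page: nothing freed */
		may_free(0xa80000, 0xc00000, 0xa40000, 0xd00000);
		return 0;
	}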

Beware: the do_mmap_pgoff driver failure case must now use unmap_region
instead of zap_page_range, since a page table might have been allocated,
and can only be freed while it is touched by some vma.

Move free_pgtables from mmap.c to memory.c, where its lower levels are
adapted from the clear_page_range levels. (Most of free_pgtables' old
code was actually for a non-existent case, prev not properly set up,
dating from before hch gave us split_vma.) Pass mmu_gather** in the
public interfaces, since we might want to add latency lockdrops later;
but there's no attempt to do so yet: going vma by vma should itself
reduce latency.

But what if is_hugepage_only_range? Those ia64 and ppc64 cases need
careful examination: put that off until a later patch of the series.

What of x86_64's 32bit vdso page, which __map_syscall32 maps outside any vma?

And the range to sparc64's flush_tlb_pgtables? It's less clear to me
now that we need to do more than is done here - every PMD_SIZE ever
occupied will be flushed; do we really have to flush every PGDIR_SIZE
ever partially occupied? A shame to complicate it unnecessarily.

Special thanks to David Miller for time spent repairing my ceilings.

Signed-off-by: Hugh Dickins <***@veritas.com>
---

arch/i386/mm/pgtable.c | 2
arch/ia64/mm/hugetlbpage.c | 37 ----------
include/linux/mm.h | 3
mm/memory.c | 152 ++++++++++++++++++++++++++++++++++-----------
mm/mmap.c | 102 ++++++------------------------
5 files changed, 141 insertions(+), 155 deletions(-)

--- 2.6.12-rc1-bk1/arch/i386/mm/pgtable.c 2005-03-02 07:39:18.000000000 +0000
+++ freepgt1/arch/i386/mm/pgtable.c 2005-03-21 19:06:35.000000000 +0000
@@ -255,6 +255,6 @@ void pgd_free(pgd_t *pgd)
if (PTRS_PER_PMD > 1)
for (i = 0; i < USER_PTRS_PER_PGD; ++i)
kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));
- /* in the non-PAE case, clear_page_range() clears user pgd entries */
+ /* in the non-PAE case, free_pgtables() clears user pgd entries */
kmem_cache_free(pgd_cache, pgd);
}
--- 2.6.12-rc1-bk1/arch/ia64/mm/hugetlbpage.c 2005-03-18 10:22:18.000000000 +0000
+++ freepgt1/arch/ia64/mm/hugetlbpage.c 2005-03-21 19:06:35.000000000 +0000
@@ -187,45 +187,12 @@ follow_huge_pmd(struct mm_struct *mm, un
}

/*
- * Same as generic free_pgtables(), except constant PGDIR_* and pgd_offset
- * are hugetlb region specific.
+ * Do nothing, until we've worked out what to do! To allow build, we
+ * must remove reference to clear_page_range since it no longer exists.
*/
void hugetlb_free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *prev,
unsigned long start, unsigned long end)
{
- unsigned long first = start & HUGETLB_PGDIR_MASK;
- unsigned long last = end + HUGETLB_PGDIR_SIZE - 1;
- struct mm_struct *mm = tlb->mm;
-
- if (!prev) {
- prev = mm->mmap;
- if (!prev)
- goto no_mmaps;
- if (prev->vm_end > start) {
- if (last > prev->vm_start)
- last = prev->vm_start;
- goto no_mmaps;
- }
- }
- for (;;) {
- struct vm_area_struct *next = prev->vm_next;
-
- if (next) {
- if (next->vm_start < start) {
- prev = next;
- continue;
- }
- if (last > next->vm_start)
- last = next->vm_start;
- }
- if (prev->vm_end > first)
- first = prev->vm_end;
- break;
- }
-no_mmaps:
- if (last < first) /* for arches with discontiguous pgd indices */
- return;
- clear_page_range(tlb, first, last);
}

void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsigned long end)
--- 2.6.12-rc1-bk1/include/linux/mm.h 2005-03-18 10:22:43.000000000 +0000
+++ freepgt1/include/linux/mm.h 2005-03-21 19:06:35.000000000 +0000
@@ -591,7 +591,8 @@ int unmap_vmas(struct mmu_gather **tlbp,
struct vm_area_struct *start_vma, unsigned long start_addr,
unsigned long end_addr, unsigned long *nr_accounted,
struct zap_details *);
-void clear_page_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end);
+void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
+ unsigned long floor, unsigned long ceiling);
int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
struct vm_area_struct *vma);
int zeromap_page_range(struct vm_area_struct *vma, unsigned long from,
--- 2.6.12-rc1-bk1/mm/memory.c 2005-03-18 10:22:45.000000000 +0000
+++ freepgt1/mm/memory.c 2005-03-23 15:27:19.000000000 +0000
@@ -110,87 +110,165 @@ void pmd_clear_bad(pmd_t *pmd)
* Note: this doesn't free the actual pages themselves. That
* has been handled earlier when unmapping all the memory regions.
*/
-static inline void clear_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
- unsigned long addr, unsigned long end)
+static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd)
{
- if (!((addr | end) & ~PMD_MASK)) {
- /* Only free fully aligned ranges */
- struct page *page = pmd_page(*pmd);
- pmd_clear(pmd);
- dec_page_state(nr_page_table_pages);
- tlb->mm->nr_ptes--;
- pte_free_tlb(tlb, page);
- }
+ struct page *page = pmd_page(*pmd);
+ pmd_clear(pmd);
+ pte_free_tlb(tlb, page);
+ dec_page_state(nr_page_table_pages);
+ tlb->mm->nr_ptes--;
}

-static inline void clear_pmd_range(struct mmu_gather *tlb, pud_t *pud,
- unsigned long addr, unsigned long end)
+static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
+ unsigned long addr, unsigned long end,
+ unsigned long floor, unsigned long ceiling)
{
pmd_t *pmd;
unsigned long next;
- pmd_t *empty_pmd = NULL;
+ unsigned long start;

+ start = addr;
pmd = pmd_offset(pud, addr);
-
- /* Only free fully aligned ranges */
- if (!((addr | end) & ~PUD_MASK))
- empty_pmd = pmd;
do {
next = pmd_addr_end(addr, end);
if (pmd_none_or_clear_bad(pmd))
continue;
- clear_pte_range(tlb, pmd, addr, next);
+ free_pte_range(tlb, pmd);
} while (pmd++, addr = next, addr != end);

- if (empty_pmd) {
- pud_clear(pud);
- pmd_free_tlb(tlb, empty_pmd);
+ start &= PUD_MASK;
+ if (start < floor)
+ return;
+ if (ceiling) {
+ ceiling &= PUD_MASK;
+ if (!ceiling)
+ return;
}
+ if (end - 1 > ceiling - 1)
+ return;
+
+ pmd = pmd_offset(pud, start);
+ pud_clear(pud);
+ pmd_free_tlb(tlb, pmd);
}

-static inline void clear_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
- unsigned long addr, unsigned long end)
+static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
+ unsigned long addr, unsigned long end,
+ unsigned long floor, unsigned long ceiling)
{
pud_t *pud;
unsigned long next;
- pud_t *empty_pud = NULL;
+ unsigned long start;

+ start = addr;
pud = pud_offset(pgd, addr);
-
- /* Only free fully aligned ranges */
- if (!((addr | end) & ~PGDIR_MASK))
- empty_pud = pud;
do {
next = pud_addr_end(addr, end);
if (pud_none_or_clear_bad(pud))
continue;
- clear_pmd_range(tlb, pud, addr, next);
+ free_pmd_range(tlb, pud, addr, next, floor, ceiling);
} while (pud++, addr = next, addr != end);

- if (empty_pud) {
- pgd_clear(pgd);
- pud_free_tlb(tlb, empty_pud);
+ start &= PGDIR_MASK;
+ if (start < floor)
+ return;
+ if (ceiling) {
+ ceiling &= PGDIR_MASK;
+ if (!ceiling)
+ return;
}
+ if (end - 1 > ceiling - 1)
+ return;
+
+ pud = pud_offset(pgd, start);
+ pgd_clear(pgd);
+ pud_free_tlb(tlb, pud);
}

/*
- * This function clears user-level page tables of a process.
- * Unlike other pagetable walks, some memory layouts might give end 0.
+ * This function frees user-level page tables of a process.
+ *
* Must be called with pagetable lock held.
*/
-void clear_page_range(struct mmu_gather *tlb,
- unsigned long addr, unsigned long end)
+static inline void free_pgd_range(struct mmu_gather *tlb,
+ unsigned long addr, unsigned long end,
+ unsigned long floor, unsigned long ceiling)
{
pgd_t *pgd;
unsigned long next;
+ unsigned long start;

+ /*
+ * The next few lines have given us lots of grief...
+ *
+ * Why are we testing PMD* at this top level? Because often
+ * there will be no work to do at all, and we'd prefer not to
+ * go all the way down to the bottom just to discover that.
+ *
+ * Why all these "- 1"s? Because 0 represents both the bottom
+ * of the address space and the top of it (using -1 for the
+ * top wouldn't help much: the masks would do the wrong thing).
+ * The rule is that addr 0 and floor 0 refer to the bottom of
+ * the address space, but end 0 and ceiling 0 refer to the top
+ * Comparisons need to use "end - 1" and "ceiling - 1" (though
+ * that end 0 case should be mythical).
+ *
+ * Wherever addr is brought up or ceiling brought down, we must
+ * be careful to reject "the opposite 0" before it confuses the
+ * subsequent tests. But what about where end is brought down
+ * by PMD_SIZE below? no, end can't go down to 0 there.
+ *
+ * Whereas we round start (addr) and ceiling down, by different
+ * masks at different levels, in order to test whether a table
+ * now has no other vmas using it, so can be freed, we don't
+ * bother to round floor or end up - the tests don't need that.
+ */
+
+ addr &= PMD_MASK;
+ if (addr < floor) {
+ addr += PMD_SIZE;
+ if (!addr)
+ return;
+ }
+ if (ceiling) {
+ ceiling &= PMD_MASK;
+ if (!ceiling)
+ return;
+ }
+ if (end - 1 > ceiling - 1)
+ end -= PMD_SIZE;
+ if (addr > end - 1)
+ return;
+
+ start = addr;
pgd = pgd_offset(tlb->mm, addr);
do {
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
continue;
- clear_pud_range(tlb, pgd, addr, next);
+ free_pud_range(tlb, pgd, addr, next, floor, ceiling);
} while (pgd++, addr = next, addr != end);
+
+ if (!tlb_is_full_mm(tlb))
+ flush_tlb_pgtables(tlb->mm, start, end);
+}
+
+void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
+ unsigned long floor, unsigned long ceiling)
+{
+ while (vma) {
+ struct vm_area_struct *next = vma->vm_next;
+ unsigned long addr = vma->vm_start;
+
+ /* Optimization: gather nearby vmas into a single call down */
+ while (next && next->vm_start <= vma->vm_end + PMD_SIZE) {
+ vma = next;
+ next = vma->vm_next;
+ }
+ free_pgd_range(*tlb, addr, vma->vm_end,
+ floor, next? next->vm_start: ceiling);
+ vma = next;
+ }
}

pte_t fastcall * pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
--- 2.6.12-rc1-bk1/mm/mmap.c 2005-03-18 10:22:45.000000000 +0000
+++ freepgt1/mm/mmap.c 2005-03-21 19:06:35.000000000 +0000
@@ -29,6 +29,10 @@
#include <asm/cacheflush.h>
#include <asm/tlb.h>

+static void unmap_region(struct mm_struct *mm,
+ struct vm_area_struct *vma, struct vm_area_struct *prev,
+ unsigned long start, unsigned long end);
+
/*
* WARNING: the debugging will use recursive algorithms so never enable this
* unless you know what you are doing.
@@ -1129,7 +1133,8 @@ unmap_and_free_vma:
fput(file);

/* Undo any partial mapping done by a device driver. */
- zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start, NULL);
+ unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
+ charged = 0;
free_vma:
kmem_cache_free(vm_area_cachep, vma);
unacct_error:
@@ -1572,66 +1577,6 @@ find_extend_vma(struct mm_struct * mm, u
}
#endif

-/*
- * Try to free as many page directory entries as we can,
- * without having to work very hard at actually scanning
- * the page tables themselves.
- *
- * Right now we try to free page tables if we have a nice
- * PGDIR-aligned area that got free'd up. We could be more
- * granular if we want to, but this is fast and simple,
- * and covers the bad cases.
- *
- * "prev", if it exists, points to a vma before the one
- * we just free'd - but there's no telling how much before.
- */
-static void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *prev,
- unsigned long start, unsigned long end)
-{
- unsigned long first = start & PGDIR_MASK;
- unsigned long last = end + PGDIR_SIZE - 1;
- struct mm_struct *mm = tlb->mm;
-
- if (last > MM_VM_SIZE(mm) || last < end)
- last = MM_VM_SIZE(mm);
-
- if (!prev) {
- prev = mm->mmap;
- if (!prev)
- goto no_mmaps;
- if (prev->vm_end > start) {
- if (last > prev->vm_start)
- last = prev->vm_start;
- goto no_mmaps;
- }
- }
- for (;;) {
- struct vm_area_struct *next = prev->vm_next;
-
- if (next) {
- if (next->vm_start < start) {
- prev = next;
- continue;
- }
- if (last > next->vm_start)
- last = next->vm_start;
- }
- if (prev->vm_end > first)
- first = prev->vm_end;
- break;
- }
-no_mmaps:
- if (last < first) /* for arches with discontiguous pgd indices */
- return;
- if (first < FIRST_USER_PGD_NR * PGDIR_SIZE)
- first = FIRST_USER_PGD_NR * PGDIR_SIZE;
- /* No point trying to free anything if we're in the same pte page */
- if ((first & PMD_MASK) < (last & PMD_MASK)) {
- clear_page_range(tlb, first, last);
- flush_tlb_pgtables(mm, first, last);
- }
-}
-
/* Normal function to fix up a mapping
* This function is the default for when an area has no specific
* function. This may be used as part of a more specific routine.
@@ -1674,24 +1619,22 @@ static void unmap_vma_list(struct mm_str
* Called with the page table lock held.
*/
static void unmap_region(struct mm_struct *mm,
- struct vm_area_struct *vma,
- struct vm_area_struct *prev,
- unsigned long start,
- unsigned long end)
+ struct vm_area_struct *vma, struct vm_area_struct *prev,
+ unsigned long start, unsigned long end)
{
+ struct vm_area_struct *next = prev? prev->vm_next: mm->mmap;
struct mmu_gather *tlb;
unsigned long nr_accounted = 0;

lru_add_drain();
+ spin_lock(&mm->page_table_lock);
tlb = tlb_gather_mmu(mm, 0);
unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted, NULL);
vm_unacct_memory(nr_accounted);
-
- if (is_hugepage_only_range(start, end - start))
- hugetlb_free_pgtables(tlb, prev, start, end);
- else
- free_pgtables(tlb, prev, start, end);
+ free_pgtables(&tlb, vma, prev? prev->vm_end: 0,
+ next? next->vm_start: 0);
tlb_finish_mmu(tlb, start, end);
+ spin_unlock(&mm->page_table_lock);
}

/*
@@ -1823,9 +1766,7 @@ int do_munmap(struct mm_struct *mm, unsi
* Remove the vma's, and unmap the actual pages
*/
detach_vmas_to_be_unmapped(mm, mpnt, prev, end);
- spin_lock(&mm->page_table_lock);
unmap_region(mm, mpnt, prev, start, end);
- spin_unlock(&mm->page_table_lock);

/* Fix up all other VM information */
unmap_vma_list(mm, mpnt);
@@ -1957,25 +1898,21 @@ EXPORT_SYMBOL(do_brk);
void exit_mmap(struct mm_struct *mm)
{
struct mmu_gather *tlb;
- struct vm_area_struct *vma;
+ struct vm_area_struct *vma = mm->mmap;
unsigned long nr_accounted = 0;

lru_add_drain();

spin_lock(&mm->page_table_lock);

- tlb = tlb_gather_mmu(mm, 1);
flush_cache_mm(mm);
- /* Use ~0UL here to ensure all VMAs in the mm are unmapped */
- mm->map_count -= unmap_vmas(&tlb, mm, mm->mmap, 0,
- ~0UL, &nr_accounted, NULL);
+ tlb = tlb_gather_mmu(mm, 1);
+ /* Use -1 here to ensure all VMAs in the mm are unmapped */
+ mm->map_count -= unmap_vmas(&tlb, mm, vma, 0, -1, &nr_accounted, NULL);
vm_unacct_memory(nr_accounted);
- BUG_ON(mm->map_count); /* This is just debugging */
- clear_page_range(tlb, FIRST_USER_PGD_NR * PGDIR_SIZE, MM_VM_SIZE(mm));
-
+ free_pgtables(&tlb, vma, 0, 0);
tlb_finish_mmu(tlb, 0, MM_VM_SIZE(mm));

- vma = mm->mmap;
mm->mmap = mm->mmap_cache = NULL;
mm->mm_rb = RB_ROOT;
mm->rss = 0;
@@ -1993,6 +1930,9 @@ void exit_mmap(struct mm_struct *mm)
remove_vm_struct(vma);
vma = next;
}
+
+ BUG_ON(mm->map_count); /* This is just debugging */
+ BUG_ON(mm->nr_ptes); /* This is just debugging */
}

/* Insert vm structure into process list sorted by address

Andi Kleen
2005-03-24 12:28:46 UTC
Post by Hugh Dickins
Recent woes with some arches needing their own pgd_addr_end macro; and
the 4-level clear_page_range regression since 2.6.10's clear_page_tables;
and its long-standing, well-known inefficiency in searching throughout
the higher-level page tables for those few entries to clear and free:
all can be blamed on ignoring the list of vmas when we free page tables.
Sorry for the late answer. Nice approach... It will not work as well
on large sparse mappings as the bit vectors, but that may be tolerable.
Post by Hugh Dickins
What of x86_64's 32bit vdso page, which __map_syscall32 maps outside any vma?
Everything. It could easily be changed, though; I was too lazy to do
it so far. Do you think it is needed for your patch?

-Andi

Nick Piggin
2005-03-25 05:33:58 UTC
Post by Hugh Dickins
And the range to sparc64's flush_tlb_pgtables? It's less clear to me
now that we need to do more than is done here - every PMD_SIZE ever
occupied will be flushed; do we really have to flush every PGDIR_SIZE
ever partially occupied? A shame to complicate it unnecessarily.
It looks like sparc64 is the only user of this, so it is up to
you, Dave.

I don't think I'd be able to decipher how sparc64 implements this.

I think Hugh and I interpreted your message in different ways.

So, to make the question more concrete: if a pgd_t is freed due
to freeing the single pmd_t contained within it (which was the
only part of the pgd's address space that contained a valid mapping),
then do you need the full PGDIR_SIZE width passed to
flush_tlb_pgtables, or just the PMD_SIZE'd start,end that covered
the freed pmd_t?


Nick Piggin
2005-03-25 05:37:25 UTC
Post by Nick Piggin
It looks like sparc64 is the only user of this, so it is up to
you, Dave.
Oh - one other question too. Doing the unmap and page table freeing in
the same pass will put freed pagecache pages in the same mmu_gather as
the freed page table pages. This looks like it may be a problem for
sparc64?

David S. Miller
2005-03-25 17:35:09 UTC
On Fri, 25 Mar 2005 16:35:12 +1100
Post by Nick Piggin
Oh - one other question too. Doing the unmap and page table freeing in
the same pass will put freed pagecache pages in the same mmu_gather as
the freed page table pages. This looks like it may be a problem for
sparc64?
No, that's fine, the code can handle this situation.

David S. Miller
2005-03-25 17:30:21 UTC
On Fri, 25 Mar 2005 16:32:07 +1100
Post by Nick Piggin
So, to make the question more concrete: if a pgd_t is freed due
to freeing the single pmd_t contained within it (which was the
only part of the pgd's address space that contained a valid mapping),
then do you need the full PGDIR_SIZE width passed to
flush_tlb_pgtables, or just the PMD_SIZE'd start,end that covered
the freed pmd_t?
Just the PMD_SIZE'd start,end is necessary.

David S. Miller
2005-03-26 00:32:31 UTC
On Fri, 25 Mar 2005 09:23:12 -0800
Post by David S. Miller
Just the PMD_SIZE'd start,end is necessary.
Since sparc64 is the only user of this thing...

Let's make it so that the flush can be queued up
at pmd_clear() time, as that's what we really want.

Something like:

pmd_clear(mm, vaddr, pmdp);

I'll try to play with something like this later.
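
That is, roughly (a hypothetical sketch of the proposed interface, which
was never merged in this form):

	static inline void pmd_clear(struct mm_struct *mm, unsigned long vaddr,
				     pmd_t *pmdp)
	{
		pmd_val(*pmdp) = 0;
		/* the arch now has mm and vaddr in hand, so it could queue
		 * a flush of just [vaddr, vaddr + PMD_SIZE) right here */
	}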

Hugh Dickins
2005-03-23 17:26:33 UTC
ia64 and sparc64 hurriedly had to introduce their own variants of
pgd_addr_end, to leapfrog over the holes in their virtual address spaces
which the final clear_page_range suddenly presented when converted from
pgd_index to pgd_addr_end. But now that free_pgtables respects the vma
list, those holes are never presented, and the arch variants can go.
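
To see what the surviving generic macro does, here is a tiny user-space
sketch of the walk idiom (assumed parameters: i386-style 4MB pgd
coverage; an illustration only, not kernel code):

	#include <stdio.h>

	#define PGDIR_SIZE	(1UL << 22)
	#define PGDIR_MASK	(~(PGDIR_SIZE - 1))

	/* Next pgd boundary, or end if that comes first; the "- 1"
	 * comparison keeps end == 0 (top of address space) working.
	 */
	#define pgd_addr_end(addr, end) \
	({	unsigned long __boundary = ((addr) + PGDIR_SIZE) & PGDIR_MASK; \
		(__boundary - 1 < (end) - 1) ? __boundary : (end); \
	})

	int main(void)
	{
		unsigned long addr = 0x00500000, end = 0x01000000, next;

		do {	/* prints 0x500000..0x800000, 0x800000..0xc00000,
			 * 0xc00000..0x1000000 */
			next = pgd_addr_end(addr, end);
			printf("pgd step %#lx..%#lx\n", addr, next);
		} while (addr = next, addr != end);
		return 0;
	}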

Signed-off-by: Hugh Dickins <***@veritas.com>
---

include/asm-generic/pgtable.h | 8 +++-----
include/asm-ia64/pgtable.h | 26 --------------------------
include/asm-sparc64/pgtable.h | 15 ---------------
3 files changed, 3 insertions(+), 46 deletions(-)

--- freepgt3/include/asm-generic/pgtable.h 2005-03-18 10:22:40.000000000 +0000
+++ freepgt4/include/asm-generic/pgtable.h 2005-03-21 19:07:13.000000000 +0000
@@ -136,17 +136,15 @@ static inline void ptep_set_wrprotect(st
#endif

/*
- * When walking page tables, get the address of the next boundary, or
- * the end address of the range if that comes earlier. Although end might
- * wrap to 0 only in clear_page_range, __boundary may wrap to 0 throughout.
+ * When walking page tables, get the address of the next boundary,
+ * or the end address of the range if that comes earlier. Although no
+ * vma end wraps to 0, rounded up __boundary may wrap to 0 throughout.
*/

-#ifndef pgd_addr_end
#define pgd_addr_end(addr, end) \
({ unsigned long __boundary = ((addr) + PGDIR_SIZE) & PGDIR_MASK; \
(__boundary - 1 < (end) - 1)? __boundary: (end); \
})
-#endif

#ifndef pud_addr_end
#define pud_addr_end(addr, end) \
--- freepgt3/include/asm-ia64/pgtable.h 2005-03-21 19:07:01.000000000 +0000
+++ freepgt4/include/asm-ia64/pgtable.h 2005-03-21 19:07:13.000000000 +0000
@@ -551,32 +551,6 @@ do { \
#define __HAVE_ARCH_PTE_SAME
#define __HAVE_ARCH_PGD_OFFSET_GATE

-/*
- * Override for pgd_addr_end() to deal with the virtual address space holes
- * in each region. In regions 0..4 virtual address bits are used like this:
- * +--------+------+--------+-----+-----+--------+
- * | pgdhi3 | rsvd | pgdlow | pmd | pte | offset |
- * +--------+------+--------+-----+-----+--------+
- * 'pgdlow' overflows to pgdhi3 (a.k.a. region bits) leaving rsvd==0
- */
-#define IA64_PGD_OVERFLOW (PGDIR_SIZE << (PAGE_SHIFT-6))
-
-#define pgd_addr_end(addr, end) \
-({ unsigned long __boundary = ((addr) + PGDIR_SIZE) & PGDIR_MASK; \
- if (REGION_NUMBER(__boundary) < 5 && \
- __boundary & IA64_PGD_OVERFLOW) \
- __boundary += (RGN_SIZE - 1) & ~(IA64_PGD_OVERFLOW - 1);\
- (__boundary - 1 < (end) - 1)? __boundary: (end); \
-})
-
-#define pmd_addr_end(addr, end) \
-({ unsigned long __boundary = ((addr) + PMD_SIZE) & PMD_MASK; \
- if (REGION_NUMBER(__boundary) < 5 && \
- __boundary & IA64_PGD_OVERFLOW) \
- __boundary += (RGN_SIZE - 1) & ~(IA64_PGD_OVERFLOW - 1);\
- (__boundary - 1 < (end) - 1)? __boundary: (end); \
-})
-
#include <asm-generic/pgtable-nopud.h>
#include <asm-generic/pgtable.h>

--- freepgt3/include/asm-sparc64/pgtable.h 2005-03-18 10:22:42.000000000 +0000
+++ freepgt4/include/asm-sparc64/pgtable.h 2005-03-21 19:07:13.000000000 +0000
@@ -432,21 +432,6 @@ extern int io_remap_page_range(struct vm
unsigned long offset,
unsigned long size, pgprot_t prot, int space);

-/* Override for {pgd,pmd}_addr_end() to deal with the virtual address
- * space hole. We simply sign extend bit 43.
- */
-#define pgd_addr_end(addr, end) \
-({ unsigned long __boundary = ((addr) + PGDIR_SIZE) & PGDIR_MASK; \
- __boundary = ((long) (__boundary << 20)) >> 20; \
- (__boundary - 1 < (end) - 1)? __boundary: (end); \
-})
-
-#define pmd_addr_end(addr, end) \
-({ unsigned long __boundary = ((addr) + PMD_SIZE) & PMD_MASK; \
- __boundary = ((long) (__boundary << 20)) >> 20; \
- (__boundary - 1 < (end) - 1)? __boundary: (end); \
-})
-
#include <asm-generic/pgtable.h>

/* We provide our own get_unmapped_area to cope with VA holes for userland */

Hugh Dickins
2005-03-23 17:23:47 UTC
There's only one usage of MM_VM_SIZE(mm) left, and it's a troublesome
macro because mm doesn't contain the (32-bit emulation?) info needed.
But it too is only needed because we ignore the end from the vma list.

We could make free_pgtables return that end, or unmap_vmas. Choose the
latter, since it's a natural fit with unmap_mapping_range_vma needing to
know its restart addr. This does make more than a minimal change, but if
unmap_vmas had returned the end before, this is how we'd have done it,
rather than storing the break_addr in zap_details.
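
The resulting caller pattern, condensed from the unmap_mapping_range_vma
hunk in the diff below:

	restart_addr = zap_page_range(vma, start_addr,
					end_addr - start_addr, details);
	if (restart_addr >= end_addr) {
		/* completed this vma: mark it so */
		vma->vm_truncate_count = details->truncate_count;
	} else {
		/* interrupted: note where to restart */
		vma->vm_truncate_count = restart_addr;
	}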

unmap_vmas used to return a count of vmas scanned, but that's just debug
which hasn't been useful in a while; and if we want the map_count==0 on
exit check back, it can easily come from the final remove_vm_struct loop.

Signed-off-by: Hugh Dickins <***@veritas.com>
---

include/asm-ia64/processor.h | 8 --------
include/asm-ppc64/processor.h | 4 ----
include/asm-s390/processor.h | 2 --
include/linux/mm.h | 9 ++-------
mm/memory.c | 28 +++++++++++++---------------
mm/mmap.c | 6 +++---
6 files changed, 18 insertions(+), 39 deletions(-)

--- freepgt1/include/asm-ia64/processor.h 2004-12-24 21:36:47.000000000 +0000
+++ freepgt2/include/asm-ia64/processor.h 2005-03-21 19:06:48.000000000 +0000
@@ -43,14 +43,6 @@
#define TASK_SIZE (current->thread.task_size)

/*
- * MM_VM_SIZE(mm) gives the maximum address (plus 1) which may contain a mapping for
- * address-space MM. Note that with 32-bit tasks, this is still DEFAULT_TASK_SIZE,
- * because the kernel may have installed helper-mappings above TASK_SIZE. For example,
- * for x86 emulation, the LDT and GDT are mapped above TASK_SIZE.
- */
-#define MM_VM_SIZE(mm) DEFAULT_TASK_SIZE
-
-/*
* This decides where the kernel will search for a free chunk of vm
* space during mmap's.
*/
--- freepgt1/include/asm-ppc64/processor.h 2005-03-18 10:22:41.000000000 +0000
+++ freepgt2/include/asm-ppc64/processor.h 2005-03-21 19:06:48.000000000 +0000
@@ -542,10 +542,6 @@ extern struct task_struct *last_task_use
#define TASK_SIZE (test_thread_flag(TIF_32BIT) ? \
TASK_SIZE_USER32 : TASK_SIZE_USER64)

-/* We can't actually tell the TASK_SIZE given just the mm, but default
- * to the 64-bit case to make sure that enough gets cleaned up. */
-#define MM_VM_SIZE(mm) TASK_SIZE_USER64
-
/* This decides where the kernel will search for a free chunk of vm
* space during mmap's.
*/
--- freepgt1/include/asm-s390/processor.h 2004-10-18 22:56:29.000000000 +0100
+++ freepgt2/include/asm-s390/processor.h 2005-03-21 19:06:48.000000000 +0000
@@ -74,8 +74,6 @@ extern struct task_struct *last_task_use

#endif /* __s390x__ */

-#define MM_VM_SIZE(mm) DEFAULT_TASK_SIZE
-
#define HAVE_ARCH_PICK_MMAP_LAYOUT

typedef struct {
--- freepgt1/include/linux/mm.h 2005-03-21 19:06:35.000000000 +0000
+++ freepgt2/include/linux/mm.h 2005-03-21 19:06:48.000000000 +0000
@@ -37,10 +37,6 @@ extern int sysctl_legacy_va_layout;
#include <asm/processor.h>
#include <asm/atomic.h>

-#ifndef MM_VM_SIZE
-#define MM_VM_SIZE(mm) ((TASK_SIZE + PGDIR_SIZE - 1) & PGDIR_MASK)
-#endif
-
#define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n))

/*
@@ -581,13 +577,12 @@ struct zap_details {
pgoff_t first_index; /* Lowest page->index to unmap */
pgoff_t last_index; /* Highest page->index to unmap */
spinlock_t *i_mmap_lock; /* For unmap_mapping_range: */
- unsigned long break_addr; /* Where unmap_vmas stopped */
unsigned long truncate_count; /* Compare vm_truncate_count */
};

-void zap_page_range(struct vm_area_struct *vma, unsigned long address,
+unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
unsigned long size, struct zap_details *);
-int unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm,
+unsigned long unmap_vmas(struct mmu_gather **tlb, struct mm_struct *mm,
struct vm_area_struct *start_vma, unsigned long start_addr,
unsigned long end_addr, unsigned long *nr_accounted,
struct zap_details *);
--- freepgt1/mm/memory.c 2005-03-23 15:27:19.000000000 +0000
+++ freepgt2/mm/memory.c 2005-03-23 15:27:53.000000000 +0000
@@ -645,7 +645,7 @@ static void unmap_page_range(struct mmu_
* @nr_accounted: Place number of unmapped pages in vm-accountable vma's here
* @details: details of nonlinear truncation or shared cache invalidation
*
- * Returns the number of vma's which were covered by the unmapping.
+ * Returns the end address of the unmapping (restart addr if interrupted).
*
* Unmap all pages in the vma list. Called under page_table_lock.
*
@@ -662,7 +662,7 @@ static void unmap_page_range(struct mmu_
* ensure that any thus-far unmapped pages are flushed before unmap_vmas()
* drops the lock and schedules.
*/
-int unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm,
+unsigned long unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long start_addr,
unsigned long end_addr, unsigned long *nr_accounted,
struct zap_details *details)
@@ -670,12 +670,11 @@ int unmap_vmas(struct mmu_gather **tlbp,
unsigned long zap_bytes = ZAP_BLOCK_SIZE;
unsigned long tlb_start = 0; /* For tlb_finish_mmu */
int tlb_start_valid = 0;
- int ret = 0;
+ unsigned long start = start_addr;
spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
int fullmm = tlb_is_full_mm(*tlbp);

for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
- unsigned long start;
unsigned long end;

start = max(vma->vm_start, start_addr);
@@ -688,7 +687,6 @@ int unmap_vmas(struct mmu_gather **tlbp,
if (vma->vm_flags & VM_ACCOUNT)
*nr_accounted += (end - start) >> PAGE_SHIFT;

- ret++;
while (start != end) {
unsigned long block;

@@ -719,7 +717,6 @@ int unmap_vmas(struct mmu_gather **tlbp,
if (i_mmap_lock) {
/* must reset count of rss freed */
*tlbp = tlb_gather_mmu(mm, fullmm);
- details->break_addr = start;
goto out;
}
spin_unlock(&mm->page_table_lock);
@@ -733,7 +730,7 @@ int unmap_vmas(struct mmu_gather **tlbp,
}
}
out:
- return ret;
+ return start; /* which is now the end (or restart) address */
}

/**
@@ -743,7 +740,7 @@ out:
* @size: number of bytes to zap
* @details: details of nonlinear truncation or shared cache invalidation
*/
-void zap_page_range(struct vm_area_struct *vma, unsigned long address,
+unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
unsigned long size, struct zap_details *details)
{
struct mm_struct *mm = vma->vm_mm;
@@ -753,15 +750,16 @@ void zap_page_range(struct vm_area_struc

if (is_vm_hugetlb_page(vma)) {
zap_hugepage_range(vma, address, size);
- return;
+ return end;
}

lru_add_drain();
spin_lock(&mm->page_table_lock);
tlb = tlb_gather_mmu(mm, 0);
- unmap_vmas(&tlb, mm, vma, address, end, &nr_accounted, details);
+ end = unmap_vmas(&tlb, mm, vma, address, end, &nr_accounted, details);
tlb_finish_mmu(tlb, address, end);
spin_unlock(&mm->page_table_lock);
+ return end;
}

/*
@@ -1346,7 +1344,7 @@ no_new_page:
* i_mmap_lock.
*
* In order to make forward progress despite repeatedly restarting some
- * large vma, note the break_addr set by unmap_vmas when it breaks out:
+ * large vma, note the restart_addr from unmap_vmas when it breaks out:
* and restart from that address when we reach that vma again. It might
* have been split or merged, shrunk or extended, but never shifted: so
* restart_addr remains valid so long as it remains in the vma's range.
@@ -1384,8 +1382,8 @@ again:
}
}

- details->break_addr = end_addr;
- zap_page_range(vma, start_addr, end_addr - start_addr, details);
+ restart_addr = zap_page_range(vma, start_addr,
+ end_addr - start_addr, details);

/*
* We cannot rely on the break test in unmap_vmas:
@@ -1396,14 +1394,14 @@ again:
need_break = need_resched() ||
need_lockbreak(details->i_mmap_lock);

- if (details->break_addr >= end_addr) {
+ if (restart_addr >= end_addr) {
/* We have now completed this vma: mark it so */
vma->vm_truncate_count = details->truncate_count;
if (!need_break)
return 0;
} else {
/* Note restart_addr in vma's truncate_count field */
- vma->vm_truncate_count = details->break_addr;
+ vma->vm_truncate_count = restart_addr;
if (!need_break)
goto again;
}
--- freepgt1/mm/mmap.c 2005-03-21 19:06:35.000000000 +0000
+++ freepgt2/mm/mmap.c 2005-03-21 19:06:48.000000000 +0000
@@ -1900,6 +1900,7 @@ void exit_mmap(struct mm_struct *mm)
struct mmu_gather *tlb;
struct vm_area_struct *vma = mm->mmap;
unsigned long nr_accounted = 0;
+ unsigned long end;

lru_add_drain();

@@ -1908,10 +1909,10 @@ void exit_mmap(struct mm_struct *mm)
flush_cache_mm(mm);
tlb = tlb_gather_mmu(mm, 1);
/* Use -1 here to ensure all VMAs in the mm are unmapped */
- mm->map_count -= unmap_vmas(&tlb, mm, vma, 0, -1, &nr_accounted, NULL);
+ end = unmap_vmas(&tlb, mm, vma, 0, -1, &nr_accounted, NULL);
vm_unacct_memory(nr_accounted);
free_pgtables(&tlb, vma, 0, 0);
- tlb_finish_mmu(tlb, 0, MM_VM_SIZE(mm));
+ tlb_finish_mmu(tlb, 0, end);

mm->mmap = mm->mmap_cache = NULL;
mm->mm_rb = RB_ROOT;
@@ -1931,7 +1932,6 @@ void exit_mmap(struct mm_struct *mm)
vma = next;
}

- BUG_ON(mm->map_count); /* This is just debugging */
BUG_ON(mm->nr_ptes); /* This is just debugging */
}

Luck, Tony
2005-03-23 19:19:07 UTC
+ * Why all these "- 1"s? Because 0 represents both the bottom
+ * of the address space and the top of it (using -1 for the
+ * top wouldn't help much: the masks would do the wrong thing).
+ * The rule is that addr 0 and floor 0 refer to the bottom of
+ * the address space, but end 0 and ceiling 0 refer to the top
+ * Comparisons need to use "end - 1" and "ceiling - 1" (though
+ * that end 0 case should be mythical).

Can we legislate that "end==0" isn't possible? On ia64 this is
certainly true (user virtual space is confined to regions 0-4, so
the max value of end is 0xa000000000000000[*]). Same applies on x86
where the max user address is 0xc0000000 (assuming a standard 3G/1G
split, and ignoring the 4G-4G patch). What do the other architectures
do? Does anyone allow:
mmap((void *)-PAGE_SIZE, PAGE_SIZE, MAP_FIXED, ...)
to succeed?

Otherwise throw in some extra macros to hide the computation needed
to make the masks work on ceiling values that represent the last byte
of the vma rather than the address beyond. Presumably we can use a
few cycles on some extra arithmetic to help us save gazillions of
cycles for all the cache misses that we currently expend traversing
empty areas of page tables at each level.
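
A tiny demonstration of the unsigned wrap being relied on here (plain
user-space C, hypothetical values):

	#include <stdio.h>

	int main(void)
	{
		unsigned long end = 0;			/* 0 means top of address space */
		unsigned long ceiling = 0xc0000000;	/* e.g. an i386-style TASK_SIZE */

		/* end - 1 wraps to ULONG_MAX, so end 0 compares above any ceiling */
		if (end - 1 > ceiling - 1)
			printf("end is above ceiling\n");
		return 0;
	}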

Latest patches run on ia64 ... I did at least throw a fork-tester at
it to make sure that we aren't leaking pagetables ... it is still
running after a few million forks, so the simple cases are not doing
anything completely bogus.

-Tony

[*] Since a three-level page table doesn't give us enough bits, the
actual limit (with a 16k page size) is 0x8000100000000000, with a hole
for the rest of region 4.

David S. Miller
2005-03-23 19:24:28 UTC
Permalink
On Wed, 23 Mar 2005 11:16:27 -0800
Post by Luck, Tony
Can we legislate that "end==0" isn't possible?
It is possible with some out-of-tree patches.

I could certainly support it on sparc64, the only
glitch is that this is where the virtually mapped
linear page tables sit right now :-)
Paul Mackerras
2005-03-23 22:08:38 UTC
Permalink
Post by Luck, Tony
Can we legislate that "end==0" isn't possible?
I think this is only likely to be a problem on 32-bit platforms with
hardware support for separate user and kernel address spaces. m68k
and sparc32 come to mind, though I might be mistaken.

Paul.
Andreas Schwab
2005-03-23 22:24:01 UTC
Permalink
Post by Paul Mackerras
Post by Luck, Tony
Can we legislate that "end==0" isn't possible?
I think this is only likely to be a problem on 32-bit platforms with
hardware support for separate user and kernel address spaces. m68k
and sparc32 come to mind, though I might be mistaken.
On m68k we don't allow addresses above 0xF0000000.

Andreas.
--
Andreas Schwab, SuSE Labs, ***@suse.de
SuSE Linux Products GmbH, Maxfeldstraße 5, 90409 Nürnberg, Germany
Key fingerprint = 58CA 54C7 6D53 942B 1756 01D3 44D5 214B 8276 4ED5
"And now for something completely different."
David S. Miller
2005-03-23 20:02:06 UTC
Permalink
On Wed, 23 Mar 2005 17:10:15 +0000 (GMT)
Here's the recut of those patches, including David Miller's vital fixes.
I'm addressing these to Nick rather than Andrew, because they're perhaps
not fit for -mm until more testing done and the x86_64 32-bit vdso issue
handled. I'm unlikely to be responsive until next week, sorry: over to
you, Nick - thanks.
Works perfectly fine on sparc64.

BTW, I note that we may still want something like that page table
bitmask stuff I worked on some time ago. I.e. for things like
what lat_mmap does in lmbench, I think that situation is more
realistic than people might think.
David S. Miller
2005-03-24 05:49:23 UTC
Permalink
On Thu, 24 Mar 2005 11:26:16 +1100
OK, attached is my first cut at slimming down the boundary tests.
Works perfectly fine on sparc64.
Luck, Tony
2005-03-24 01:03:38 UTC
Permalink
OK, attached is my first cut at slimming down the boundary tests.
I have only had a chance to try it on i386, so I hate to drop it
on you like this - but I *have* put a bit of thought into it....
Treat it as an RFC, and I'll try to test it on a wider range of
things in the next couple of days.
Not that there is anything really nasty with your system David,
so I don't think it will be a big disaster if I can't get this to
work.
Goes on top of Hugh's 6 patches.
Runs on ia64. Looks much cleaner too.

-Tony
Nick Piggin
2005-03-24 01:10:02 UTC
Permalink
Post by Luck, Tony
OK, attached is my first cut at slimming down the boundary tests.
I have only had a chance to try it on i386, so I hate to drop it
on you like this - but I *have* put a bit of thought into it....
Treat it as an RFC, and I'll try to test it on a wider range of
things in the next couple of days.
Not that there is anything really nasty with your system David,
so I don't think it will be a big disaster if I can't get this to
work.
Goes on top of Hugh's 6 patches.
Runs on ia64. Looks much cleaner too.
Oh good. Thanks for testing, Tony.

Russell King
2005-03-25 21:27:05 UTC
Permalink
Here's the recut of those patches, including David Miller's vital fixes.
I'm addressing these to Nick rather than Andrew, because they're perhaps
not fit for -mm until more testing done and the x86_64 32-bit vdso issue
handled. I'm unlikely to be responsive until next week, sorry: over to
you, Nick - thanks.
I thought I'd try these out on ARM, but alas they don't apply to
2.6.12-rc1. ;( This means I won't be testing them, sorry.
--
Russell King
Linux kernel 2.6 ARM Linux - http://www.arm.linux.org.uk/
maintainer of: 2.6 Serial core
Nick Piggin
2005-03-26 02:09:14 UTC
Permalink
Post by Russell King
Here's the recut of those patches, including David Miller's vital fixes.
I'm addressing these to Nick rather than Andrew, because they're perhaps
not fit for -mm until more testing done and the x86_64 32-bit vdso issue
handled. I'm unlikely to be responsive until next week, sorry: over to
you, Nick - thanks.
I thought I'd try these out on ARM, but alas they don't apply to
2.6.12-rc1. ;( This means I won't be testing them, sorry.
The reject should be confined to include/asm-ia64, so it will still
work for you.

But I've put a clean rollup of all Hugh's patches here in case you'd
like to try it.

http://www.kerneltrap.org/~npiggin/freepgt-2.6.12-rc1.patch

Russell King
2005-03-26 11:40:18 UTC
Permalink
Post by Nick Piggin
The reject should be confined to include/asm-ia64, so it will still
work for you.
I guess I should've tried a little harder last night then. Sorry.
Post by Nick Piggin
But I've put a clean rollup of all Hugh's patches here in case you'd
like to try it.
http://www.kerneltrap.org/~npiggin/freepgt-2.6.12-rc1.patch
This works fine on ARM with high vectors. With low vectors (located in
the 1st page of virtual memory space) I get:

kernel BUG at mm/mmap.c:1934!
Unable to handle kernel NULL pointer dereference at virtual address 00000000
pgd = c1e88000
[00000000] *pgd=c1e86031, *pte=c04440cf, *ppte=c044400e
Internal error: Oops: c1e8981f [#1]
Modules linked in:
CPU: 0
PC is at __bug+0x40/0x54
LR is at 0x1
pc : [<c0223870>] lr : [<00000001>] Not tainted
sp : c1e7bd18 ip : 60000093 fp : c1e7bd28
r10: c1f4b040 r9 : 00000006 r8 : c1f02ca0
r7 : 00000000 r6 : 00000000 r5 : c015b8c0 r4 : 00000000
r3 : 00000000 r2 : 00000000 r1 : 00000d4e r0 : 00000001
Flags: nZCv IRQs on FIQs on Mode SVC_32 Segment user
Control: C1E8917F Table: C1E8917F DAC: 00000015
Process init (pid: 235, stack limit = 0xc1e7a194)
Stack: (0xc1e7bd18 to 0xc1e7c000)
...
Backtrace:
[<c0223830>] (__bug+0x0/0x54) from [<c02691d8>] (exit_mmap+0x154/0x168)
r4 = C1E7BD3C
[<c0269084>] (exit_mmap+0x0/0x168) from [<c02358c8>] (mmput+0x40/0xc0)
r7 = C015B8C0 r6 = C015B8C0 r5 = 00000000 r4 = C015B8C0
[<c0235888>] (mmput+0x0/0xc0) from [<c027ecec>] (exec_mmap+0xec/0x134)
r4 = C015B6A0
[<c027ec00>] (exec_mmap+0x0/0x134) from [<c027f234>] (flush_old_exec+0x4c8/0x6e4)
r7 = C012B940 r6 = C1E7A000 r5 = C0498A00 r4 = 00000000
[<c027ed6c>] (flush_old_exec+0x0/0x6e4) from [<c029d53c>] (load_elf_binary+0x5c0/0xdc0)
[<c029cf7c>] (load_elf_binary+0x0/0xdc0) from [<c027f6e0>] (search_binary_handler+0xa0/0x244)
[<c027f640>] (search_binary_handler+0x0/0x244) from [<c029c4e8>] (load_script+0x224/0x22c)
[<c029c2c4>] (load_script+0x0/0x22c) from [<c027f6e0>] (search_binary_handler+0xa0/0x244)
r6 = C1E7A000 r5 = C015E400 r4 = C03EC2B4
[<c027f640>] (search_binary_handler+0x0/0x244) from [<c027f9b8>] (do_execve+0x134/0x1f8)
[<c027f884>] (do_execve+0x0/0x1f8) from [<c02223f8>] (sys_execve+0x3c/0x5c)
[<c02223bc>] (sys_execve+0x0/0x5c) from [<c021dca0>] (ret_fast_syscall+0x0/0x2c)
r7 = 0000000B r6 = BED6AA74 r5 = BED6AB00 r4 = 0200F008
Code: 1b0051b8 e59f0014 eb0051b6 e3a03000 (e5833000)

In this case, we have a page which must be kept mapped at virtual address
0, which means the first entry in the L1 page table must always exist.
However, user threads start from 0x8000, which is also mapped via the
first entry in the L1 page table.
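
To make the overlap concrete, a sketch using the 2MB-per-L1-entry
figure mentioned later in this thread (the macro names are invented
for illustration):

	#define ARM_L1_SPAN	0x00200000UL	/* one L1 entry maps 2MB */
	#define l1_index(addr)	((addr) / ARM_L1_SPAN)

	/* l1_index(0x00000000) == 0: the vectors page
	 * l1_index(0x00008000) == 0: the start of user text
	 * so freeing L1 entry 0 also unmaps the vectors */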

At a guess, I'd imagine that we're freeing the first L1 page table entry,
thereby causing mm->nr_ptes to become -1.

I'll do some debugging and try to work out if that (or exactly what's)
going on.
--
Russell King
Linux kernel 2.6 ARM Linux - http://www.arm.linux.org.uk/
maintainer of: 2.6 Serial core
Russell King
2005-03-26 13:39:26 UTC
Permalink
Post by Russell King
In this case, we have a page which must be kept mapped at virtual address
0, which means the first entry in the L1 page table must always exist.
However, user threads start from 0x8000, which is also mapped via the
first entry in the L1 page table.
At a guess, I'd imagine that we're freeing the first L1 page table entry,
thereby causing mm->nr_ptes to become -1.
I'll do some debugging and try to work out if that (or exactly what's)
going on.
Ok. What's happening is that the ARM pgd_alloc uses pte_alloc_map() to
allocate the first L1 page table. This sets mm->nr_ptes to be one.

The ARM free_pgd knows about this, and will free this PTE at the
appropriate time. However, exit_mmap() doesn't know about this itself,
so in the ARM case, it should BUG_ON(mm->nr_ptes != 1) if we're using
low vectors.

I guess we could hack it such that the ARM pgd_alloc decrements mm->nr_ptes
itself to keep things balanced, since free_pte_range() won't be called
for this pte. However, I don't like that because it's likely to break at
some point in the future.

Any ideas how to cleanly support this with the new infrastructure?
--
Russell King
Linux kernel 2.6 ARM Linux - http://www.arm.linux.org.uk/
maintainer of: 2.6 Serial core

Nick Piggin
2005-03-26 13:52:56 UTC
Permalink
Post by Russell King
Ok. What's happening is that the ARM pgd_alloc uses pte_alloc_map() to
allocate the first L1 page table. This sets mm->nr_ptes to be one.
The ARM free_pgd knows about this, and will free this PTE at the
appropriate time. However, exit_mmap() doesn't know about this itself,
so in the ARM case, it should BUG_ON(mm->nr_ptes != 1) if we're using
low vectors.
I guess we could hack it such that the ARM pgd_alloc decrements mm->nr_ptes
itself to keep things balanced, since free_pte_range() won't be called
for this pte. However, I don't like that because its likely to break at
some point in the future.
Any ideas how to cleanly support this with the new infrastructure?
Hmm, in that case it could be just a problem with that BUG_ON() -
it wasn't there before... but it seems like a very useful test,
especially with all this new work going on, so it would be a shame
not to run it in releases.

But I don't quite understand (should really look at the code more),
how come you aren't leaking memory? Do _all_ processes share this
same first L1 page table? If so, can it be allocated with a more
specialised function? If not, can nr_ptes be decremented in the
place where it is freed?

/me goes to bed now - I'll have a bit of a look tomorrow.

Nick

Russell King
2005-03-26 15:06:07 UTC
Permalink
Post by Nick Piggin
Hmm, in that case it could be just a problem with that BUG_ON() -
it wasn't there before... but it seems like a very useful test,
especially with all this new work going on, so it would be a shame
not to run it in releases.
Indeed.
Post by Nick Piggin
But I don't quite understand (should really look at the code more),
how come you aren't leaking memory?
The ARM free_pgd_slow() knows about this special first L1 page table, and
knows to free it if it exists when it's called.
Post by Nick Piggin
Do _all_ processes share this same first L1 page table?
No. It has to be specific to each process. Each L1 page table entry
covers 2MB, but executables start at virtual 0x8000.

I guess we could open-code pte_alloc_map() in get_pgd_slow() which
could solve this problem by omitting the mm->nr_ptes accounting; it
may also reduce the complexity as well.

I'm just slightly concerned that this may be rather fragile in terms
of future development.
--
Russell King
Linux kernel 2.6 ARM Linux - http://www.arm.linux.org.uk/
maintainer of: 2.6 Serial core
Russell King
2005-03-26 15:55:10 UTC
Permalink
OK, thanks that would be good. You could well be right in your analysis.
Yes, this patch seems to also be required, otherwise I see:

free_pgtables : floor 40015000 ceiling 4001b000
free_pgd_range: floor 40015000 ceiling 4001b000 addr 40015000 end 40016000
free_pgtables : floor 40015000 ceiling 4001b000
free_pgd_range: floor 40015000 ceiling 4001b000 addr 40015000 end 40016000
free_pgtables : floor 40015000 ceiling 4001b000
free_pgd_range: floor 40015000 ceiling 4001b000 addr 40015000 end 40016000
free_pgtables : floor 00000000 ceiling 00000000
free_pgd_range: floor 00000000 ceiling 40000000 addr 00008000 end 0001d000
free_pud_range: floor 00000000 ceiling 40000000 addr 00000000 end 00200000
free_pmd_range: floor 00000000 ceiling 40000000 addr 00000000 end 00200000
free_pgd_range: floor 00000000 ceiling bef4e000 addr 40000000 end 4012d000
free_pud_range: floor 00000000 ceiling bef4e000 addr 40000000 end 40200000
free_pmd_range: floor 00000000 ceiling bef4e000 addr 40000000 end 40200000
free_pgd_range: floor 00000000 ceiling 00000000 addr bef4e000 end bef63000
free_pud_range: floor 00000000 ceiling 00000000 addr bee00000 end bf000000
free_pmd_range: floor 00000000 ceiling 00000000 addr bee00000 end bf000000
exit_mmap: nr_ptes -1

The above is with my fix to ARM's get_pgd_slow(), which shows that we
accidentally freed the first entry in the L1 page table. With my
fix and your patch, low-vectored ARMs work again.

I don't think it'll be invasive to push my get_pgd_slow() fix before
these freepgt patches appear. For the record, this is the patch I'm
using at present. With a bit more effort, I could probably eliminate
pmd_alloc (and therefore the unnecessary spinlocking) here.

diff -up -x BitKeeper -x ChangeSet -x SCCS -x _xlk -x *.orig -x *.rej orig/arch/arm/mm/mm-armv.c linux/arch/arm/mm/mm-armv.c
--- orig/arch/arm/mm/mm-armv.c Sat Mar 19 11:20:01 2005
+++ linux/arch/arm/mm/mm-armv.c Sat Mar 26 15:51:57 2005
@@ -160,6 +160,8 @@ pgd_t *get_pgd_slow(struct mm_struct *mm
init_pgd = pgd_offset_k(0);

if (!vectors_high()) {
+ struct page *new;
+
/*
* This lock is here just to satisfy pmd_alloc and pte_lock
*/
@@ -170,12 +172,16 @@ pgd_t *get_pgd_slow(struct mm_struct *mm
* contains the machine vectors.
*/
new_pmd = pmd_alloc(mm, new_pgd, 0);
+ spin_unlock(&mm->page_table_lock);
if (!new_pmd)
goto no_pmd;

- new_pte = pte_alloc_map(mm, new_pmd, 0);
- if (!new_pte)
+ new = pte_alloc_one(mm, 0);
+ if (!new)
goto no_pte;
+ inc_page_state(nr_page_table_pages);
+ pmd_populate(mm, new_pmd, new);
+ new_pte = pte_offset_map(new_pmd, 0);

init_pmd = pmd_offset(init_pgd, 0);
init_pte = pte_offset_map_nested(init_pmd, 0);
@@ -197,16 +203,9 @@ pgd_t *get_pgd_slow(struct mm_struct *mm
return new_pgd;

no_pte:
- spin_unlock(&mm->page_table_lock);
pmd_free(new_pmd);
- free_pages((unsigned long)new_pgd, 2);
- return NULL;
-
no_pmd:
- spin_unlock(&mm->page_table_lock);
free_pages((unsigned long)new_pgd, 2);
- return NULL;
-
no_pgd:
return NULL;
}
--
Russell King
Linux kernel 2.6 ARM Linux - http://www.arm.linux.org.uk/
maintainer of: 2.6 Serial core
Nick Piggin
2005-03-27 03:43:39 UTC
Permalink
Post by Russell King
OK, thanks that would be good. You could well be right in your analysis.
[...]

OK.
Post by Russell King
The above is with my fix to ARM's get_pgd_slow(), which shows that we
accidentally freed the first entry in the L1 page table. With my
fix and your patch, low-vectored ARMs work again.
I don't think it'll be invasive to push my get_pgd_slow() fix before
these freepgt patches appear. For the record, this is the patch I'm
using at present. With a bit more effort, I could probably eliminate
pmd_alloc (and therefore the unnecessary spinlocking) here.
Seems OK if you're happy with it. Is this going to leak
"nr_page_table_pages" too, though?

Hmm... no, because free_pgd_slow decrements it? In that case, can
you have free_pgd_slow also decrement nr_ptes, instead of your
below patch?

Russell King
2005-03-27 07:59:32 UTC
Permalink
Post by Nick Piggin
Hmm... no, because free_pgd_slow decrements it? In that case, can
you have free_pgd_slow also decrement nr_ptes, instead of your
below patch?
Unfortunately not - free_pgd_slow doesn't have any knowledge about the
mm_struct that the pgd was associated with.

Also remember that nr_ptes wasn't incremented in get_pgd_slow() for this
page table either, and this is the "bug" fix.
--
Russell King
Linux kernel 2.6 ARM Linux - http://www.arm.linux.org.uk/
maintainer of: 2.6 Serial core
David S. Miller
2005-03-27 18:23:32 UTC
Permalink
On Sun, 27 Mar 2005 08:57:25 +0100
Post by Russell King
Unfortunately not - free_pgd_slow doesn't have any knowledge about the
mm_struct that the pgd was associated with.
You could store the mm pointer in the page struct of the
pgd; we used to do that before set_pte_at() existed on
sparc64 and ppc64 for pte level tables.

page->mapping and page->index are basically free game for
tracking information associated with page table chunks.
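
A sketch of that bookkeeping as it might look (the helper names are
assumptions, not code from this thread):

	/* remember which mm owns this pgd, using the otherwise unused
	 * page->mapping field of the pgd's struct page */
	static inline void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
	{
		virt_to_page(pgd)->mapping = (struct address_space *)mm;
	}

	static inline struct mm_struct *pgd_get_mm(pgd_t *pgd)
	{
		return (struct mm_struct *)virt_to_page(pgd)->mapping;
	}

free_pgd_slow() could then recover the owning mm via pgd_get_mm().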
Russell King
2005-03-28 07:55:08 UTC
Permalink
Post by David S. Miller
On Sun, 27 Mar 2005 08:57:25 +0100
Post by Russell King
Unfortunately not - free_pgd_slow doesn't have any knowledge about the
mm_struct that the pgd was associated with.
You could store the mm pointer in the page struct of the
pgd, we used to that before set_pte_at() existed on
sparc64 and ppc64 for pte level tables.
page->mapping and page->index are basically free game for
tracking information assosciated with page table chunks.
Why would I want to do this, given that decrementing mm->nr_ptes in
free_pgd_slow() would make it negative? Am I missing something
obvious?
--
Russell King
Linux kernel 2.6 ARM Linux - http://www.arm.linux.org.uk/
maintainer of: 2.6 Serial core
David S. Miller
2005-03-28 18:51:45 UTC
Permalink
On Mon, 28 Mar 2005 08:51:36 +0100
Post by Russell King
Why would I want to do this, given that decrementing mm->nr_ptes in
free_pgd_slow() would make it negative? Am I missing something
obvious?
You were saying there was no way to figure out which mm is
associated with a particular pgd, and I'm merely showing you
how you can in fact do it. :-)