From: Vijaya Kumar K <Vijaya.Kumar(a)caviumnetworks.com>
Based on the step-handler and break-handler hooks patch from
Sandeepa (Patch 1), KGDB debugging support is added for EL1
debug in AArch64 mode. Any updates that come for Patch 1 from
Sandeepa will be rebased in the next version.
With the first patch, the register layout is updated to be in line
with the GDB tool. Basic GDB connection, breakpoint set/clear and
info commands are supported, except step/next debugging.
With the second patch, step/next debugging support is added, wherein
the PC is updated to point to the instruction to be stepped and
stopped at.
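As an illustration of that flow, a minimal hypothetical sketch (not
the patch itself; the helper names used are assumptions):

/*
 * Hypothetical sketch: on a GDB 's' (step) packet, point the PC at
 * the address to step from, then arm single-step.
 */
static int handle_step_packet(char *remcom_in_buffer, struct pt_regs *regs)
{
        unsigned long addr;
        char *ptr = &remcom_in_buffer[1];

        /* optional address argument in the packet: "s[addr]" */
        if (kgdb_hex2long(&ptr, &addr))
                regs->pc = addr;        /* step from this instruction */

        kernel_enable_single_step(regs);
        return 0;
}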
v2:
- Moved break instruction encoding to the debug-monitors.h file
- Fixed endianness of the compiled break instruction encoding
- Updated I/O buffer sizes
- Updated register buffer size
- Removed changes to the debug_exception handler in entry.S for the
  ELR update; step debugging now updates the PC instead of the ELR
- Rebased against the AArch64 upstream kernel
v1:
- Initial patch-set
Tested with the AArch64 GDB toolchain on the simulator.
Sandeepa Prabhu (1):
AArch64: Add single-step and breakpoint handler hooks
Vijaya Kumar K (2):
AArch64: KGDB: Add Basic KGDB support
AArch64: KGDB: Add step debugging support
arch/arm64/include/asm/debug-monitors.h | 30 +++
arch/arm64/include/asm/kgdb.h | 81 ++++++++
arch/arm64/kernel/Makefile | 1 +
arch/arm64/kernel/debug-monitors.c | 95 ++++++++-
arch/arm64/kernel/entry.S | 6 +-
arch/arm64/kernel/kgdb.c | 341 +++++++++++++++++++++++++++++++
6 files changed, 550 insertions(+), 4 deletions(-)
create mode 100644 arch/arm64/include/asm/kgdb.h
create mode 100644 arch/arm64/kernel/kgdb.c
--
1.7.9.5
The memory pinning code in uaccess_with_memcpy.c does not check
for HugeTLB or THP pmds, and will enter an infinite loop should
a __copy_to_user or __clear_user occur against a huge page.
This patch adds detection code for huge pages to pin_page_for_write.
As this code can be executed in a fast path it refers to the actual
pmds rather than the vma. If a HugeTLB or THP is found (they have
the same pmd representation on ARM), the page table spinlock is
taken to prevent modification whilst the page is pinned.
On ARM, huge pages are only represented as pmds, thus no huge pud
checks are performed. (For huge puds one would lock the page table
in a similar manner as in the pmd case).
Two helper macros are introduced: pmd_thp_or_huge checks whether a
page is a HugeTLB or transparent huge page (they have the same pmd
layout on ARM), and pmd_hugewillfault detects whether a page fault
will occur on write to the page.
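As a rough sketch, the caller pattern then looks like this
(simplified from the diff below; fallback details elided):

        pte_t *pte;
        spinlock_t *ptl;

        /*
         * For a huge page, pin_page_for_write() returns with
         * *ptep == NULL and the mm's page_table_lock held, so the
         * unlock path must distinguish the two cases.
         */
        while (!pin_page_for_write(addr, &pte, &ptl))
                ;       /* fall back to the regular faulting copy */

        memcpy((void *)addr, src, tocopy);

        if (pte)
                pte_unmap_unlock(pte, ptl);     /* small page: pte lock */
        else
                spin_unlock(ptl);               /* huge page: page_table_lock */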
Running the following test (with the chunking from read_zero
removed):
$ dd if=/dev/zero of=/dev/null bs=10M count=1024
Gave: 2.3 GB/s backed by normal pages,
2.9 GB/s backed by huge pages,
5.1 GB/s backed by huge pages, with page mask=HPAGE_MASK.
After some discussion, it was decided not to adopt the HPAGE_MASK,
as this would have a significant detrimental effect on the overall
system latency due to page_table_lock being held for too long.
This could be revisited if split huge page locks are adopted.
Signed-off-by: Steve Capper <steve.capper(a)linaro.org>
Reviewed-by: Nicolas Pitre <nico(a)linaro.org>
---
arch/arm/include/asm/pgtable-3level.h | 3 +++
arch/arm/lib/uaccess_with_memcpy.c | 41 ++++++++++++++++++++++++++++++++---
2 files changed, 41 insertions(+), 3 deletions(-)
diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h
index 5689c18..39c54cf 100644
--- a/arch/arm/include/asm/pgtable-3level.h
+++ b/arch/arm/include/asm/pgtable-3level.h
@@ -206,6 +206,9 @@ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long addr)
#define __HAVE_ARCH_PMD_WRITE
#define pmd_write(pmd) (!(pmd_val(pmd) & PMD_SECT_RDONLY))
+#define pmd_hugewillfault(pmd) (!pmd_young(pmd) || !pmd_write(pmd))
+#define pmd_thp_or_huge(pmd) (pmd_huge(pmd) || pmd_trans_huge(pmd))
+
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#define pmd_trans_huge(pmd) (pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT))
#define pmd_trans_splitting(pmd) (pmd_val(pmd) & PMD_SECT_SPLITTING)
diff --git a/arch/arm/lib/uaccess_with_memcpy.c b/arch/arm/lib/uaccess_with_memcpy.c
index 025f742..3e58d71 100644
--- a/arch/arm/lib/uaccess_with_memcpy.c
+++ b/arch/arm/lib/uaccess_with_memcpy.c
@@ -18,6 +18,7 @@
#include <linux/hardirq.h> /* for in_atomic() */
#include <linux/gfp.h>
#include <linux/highmem.h>
+#include <linux/hugetlb.h>
#include <asm/current.h>
#include <asm/page.h>
@@ -40,7 +41,35 @@ pin_page_for_write(const void __user *_addr, pte_t **ptep, spinlock_t **ptlp)
return 0;
pmd = pmd_offset(pud, addr);
- if (unlikely(pmd_none(*pmd) || pmd_bad(*pmd)))
+ if (unlikely(pmd_none(*pmd)))
+ return 0;
+
+ /*
+ * A pmd can be bad if it refers to a HugeTLB or THP page.
+ *
+ * Both THP and HugeTLB pages have the same pmd layout
+ * and should not be manipulated by the pte functions.
+ *
+ * Lock the page table for the destination and check
+ * to see that it's still huge and whether or not we will
+ * need to fault on write, or if we have a splitting THP.
+ */
+ if (unlikely(pmd_thp_or_huge(*pmd))) {
+ ptl = &current->mm->page_table_lock;
+ spin_lock(ptl);
+ if (unlikely(!pmd_thp_or_huge(*pmd)
+ || pmd_hugewillfault(*pmd)
+ || pmd_trans_splitting(*pmd))) {
+ spin_unlock(ptl);
+ return 0;
+ }
+
+ *ptep = NULL;
+ *ptlp = ptl;
+ return 1;
+ }
+
+ if (unlikely(pmd_bad(*pmd)))
return 0;
pte = pte_offset_map_lock(current->mm, pmd, addr, &ptl);
@@ -94,7 +123,10 @@ __copy_to_user_memcpy(void __user *to, const void *from, unsigned long n)
from += tocopy;
n -= tocopy;
- pte_unmap_unlock(pte, ptl);
+ if (pte)
+ pte_unmap_unlock(pte, ptl);
+ else
+ spin_unlock(ptl);
}
if (!atomic)
up_read(&current->mm->mmap_sem);
@@ -147,7 +179,10 @@ __clear_user_memset(void __user *addr, unsigned long n)
addr += tocopy;
n -= tocopy;
- pte_unmap_unlock(pte, ptl);
+ if (pte)
+ pte_unmap_unlock(pte, ptl);
+ else
+ spin_unlock(ptl);
}
up_read(&current->mm->mmap_sem);
--
1.8.1.4
In a Thumb2 kernel (CONFIG_THUMB2_KERNEL) kexec's relocate code is
assembled in Thumb2 mode, but cpu_v7_reset() jumps to this code in ARM
state, because its address is page aligned and therefore has 0 in the
LSB.
Assemble this code in ARM mode to fix the issue.
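For context, an illustrative sketch (not part of the patch) of why
the jump lands in ARM state:

/*
 * Illustrative only: an interworking branch selects the instruction
 * set from bit 0 of the target address.  A page-aligned symbol always
 * has bit 0 clear, so the CPU enters it in ARM state even in a
 * CONFIG_THUMB2_KERNEL build -- hence the target must be assembled
 * as ARM code.
 */
typedef void (*phys_reset_t)(unsigned long);

static void jump_to_relocator(unsigned long reloc_phys, unsigned long arg)
{
        /* reloc_phys is page aligned => bit 0 == 0 => ARM state */
        phys_reset_t reset = (phys_reset_t)reloc_phys;

        reset(arg);     /* does not return */
}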
Signed-off-by: Taras Kondratiuk <taras.kondratiuk(a)linaro.org>
---
Based on v3.12-rc4
Cc: Dave Martin <dave.martin(a)linaro.org>
Cc: Will Deacon <will.deacon(a)arm.com>
Cc: Russell King <linux(a)arm.linux.org.uk>
Cc: linaro-kernel(a)lists.linaro.org
Cc: linux-arm-kernel(a)lists.infradead.org
---
arch/arm/kernel/relocate_kernel.S | 1 +
1 file changed, 1 insertion(+)
diff --git a/arch/arm/kernel/relocate_kernel.S b/arch/arm/kernel/relocate_kernel.S
index d0cdedf..a3af323 100644
--- a/arch/arm/kernel/relocate_kernel.S
+++ b/arch/arm/kernel/relocate_kernel.S
@@ -5,6 +5,7 @@
#include <asm/kexec.h>
.globl relocate_new_kernel
+ .arm
relocate_new_kernel:
ldr r0,kexec_indirection_page
--
1.7.9.5
The coherent DMA allocator code contained a compile-time warning
when HugeTLB support was enabled. It stated that huge pages were
not supported by the DMA allocator.
Apart from memory pressure, HugeTLB should not affect (or be
affected by) the higher order pages operated on by the DMA
allocator. Also, the user space mappings returned by arm_dma_mmap
are done via remap_pfn_range, so the Transparent Huge Page daemon
will leave them alone too.
This patch removes the huge page warning from dma-mapping.c.
Signed-off-by: Steve Capper <steve.capper(a)linaro.org>
---
Hi, I'm resending this patch as it appears to have slipped through the
cracks. Without this patch we will get spurious compiler warnings when
building kernels with huge page support.
Cheers,
--
Steve
---
arch/arm/mm/dma-mapping.c | 3 ---
1 file changed, 3 deletions(-)
diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index 7f9b179..9486048 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -249,9 +249,6 @@ static void __dma_free_buffer(struct page *page, size_t size)
}
#ifdef CONFIG_MMU
-#ifdef CONFIG_HUGETLB_PAGE
-#warning ARM Coherent DMA allocator does not (yet) support huge TLB
-#endif
static void *__alloc_from_contiguous(struct device *dev, size_t size,
pgprot_t prot, struct page **ret_page,
--
1.8.1.4
Hi Will, Ben, Russell, Thomas,
Please review the second version of the patch that fixes the TLB ASID
issue in the big endian V7 image.
Changes from v1:
Note: the previous patch subject line was 'ARM: tlb:
__flush_tlb_mm need to use int asid var for BE correct operation'.
Added an 'unsigned int' cast into the ASID macro itself rather
than using an intermediate 'int' variable in the __flush_tlb_mm
function. This is done per the v1 patch discussion at
http://lists.infradead.org/pipermail/linux-arm-kernel/2013-October/202583.h…
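The resulting macro form looks roughly like this (a sketch assumed
from the description above; the exact field names may differ):

#define ASID_BITS       8
#define ASID_MASK       ((~0ULL) << ASID_BITS)
/* cast inside the macro so the 64-bit generation counter is
 * truncated to 32 bits before reaching any inline-asm "r" operand */
#define ASID(mm)        ((unsigned int)((mm)->context.id.counter & ~ASID_MASK))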
Tested with Linaro BE topic branch on Arndale board. Both LE and BE
images were tested.
Thanks,
Victor
Victor Kamensky (1):
ARM: tlb: ASID macro should give 32bit result for BE correct operation
arch/arm/include/asm/mmu.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--
1.8.1.4
Apologies for the noise: I resent this series but neglected to place
the word "PATCH" in the subject lines of the patches, so they got
blocked by the list server. Here is another go at a resend.
---
The following patches bring both HugeTLB support and Transparent
HugePage (THP) support to ARM for 2 levels of paging (i.e. without
LPAE).
This code has been tested on an Arndale board (Exynos 5250), and
is based on 3.11-rc1, with the hugepage simplification patch:
http://lists.infradead.org/pipermail/linux-arm-kernel/2013-July/184117.html
HugeTLB and THP support for LPAE has already been merged in 3.11-rc1.
Hugepages can give nice performance boosts to workloads that put
pressure on the TLBs. I've observed uplifts of ~5% to some tasks
just by enabling hugepages via the libhugetlbfs tools. Other
people have observed decent performance boosts when huge pages
are enabled:
http://lists.infradead.org/pipermail/linux-arm-kernel/2013-February/148835.…
I would appreciate any discussion on these patches, as there are
people who have an interest in having huge page support in
non-LPAE kernels.
Steve Capper (2):
ARM: mm: HugeTLB support for non-LPAE systems.
ARM: mm: Transparent huge page support for non-LPAE systems.
arch/arm/Kconfig | 4 +-
arch/arm/include/asm/hugetlb-2level.h | 126 +++++++++++++++++++++++++
arch/arm/include/asm/hugetlb.h | 4 +
arch/arm/include/asm/pgtable-2level.h | 170 ++++++++++++++++++++++++++++++++++
arch/arm/include/asm/pgtable-3level.h | 6 ++
arch/arm/include/asm/pgtable.h | 7 +-
arch/arm/include/asm/tlb.h | 10 +-
arch/arm/kernel/head.S | 10 +-
arch/arm/mm/fault.c | 13 ---
arch/arm/mm/fsr-2level.c | 4 +-
arch/arm/mm/hugetlbpage.c | 2 +-
arch/arm/mm/mmu.c | 27 ++++++
12 files changed, 360 insertions(+), 23 deletions(-)
create mode 100644 arch/arm/include/asm/hugetlb-2level.h
--
1.8.1.4
Hello,
I'm resending this series again, to try and provoke some discussion
on it.
Cheers,
--
Steve
---
The following patches bring both HugeTLB support and Transparent
HugePage (THP) support to ARM for 2 levels of paging (i.e. without
LPAE).
This code has been tested on an Arndale board (Exynos 5250), and
is based on 3.11-rc1, with the hugepage simplification patch:
http://lists.infradead.org/pipermail/linux-arm-kernel/2013-July/184117.html
HugeTLB and THP support for LPAE has already been merged in 3.11-rc1.
Hugepages can give nice performance boosts to workloads that put
pressure on the TLBs. I've observed uplifts of ~5% to some tasks
just by enabling hugepages via the libhugetlbfs tools. Other
people have observed decent performance boosts when huge pages
are enabled:
http://lists.infradead.org/pipermail/linux-arm-kernel/2013-February/148835.…
I would appreciate any discussion on these patches, as there are
people who have an interest in having huge page support in
non-LPAE kernels.
Steve Capper (2):
ARM: mm: HugeTLB support for non-LPAE systems.
ARM: mm: Transparent huge page support for non-LPAE systems.
arch/arm/Kconfig | 4 +-
arch/arm/include/asm/hugetlb-2level.h | 126 +++++++++++++++++++++++++
arch/arm/include/asm/hugetlb.h | 4 +
arch/arm/include/asm/pgtable-2level.h | 170 ++++++++++++++++++++++++++++++++++
arch/arm/include/asm/pgtable-3level.h | 6 ++
arch/arm/include/asm/pgtable.h | 7 +-
arch/arm/include/asm/tlb.h | 10 +-
arch/arm/kernel/head.S | 10 +-
arch/arm/mm/fault.c | 13 ---
arch/arm/mm/fsr-2level.c | 4 +-
arch/arm/mm/hugetlbpage.c | 2 +-
arch/arm/mm/mmu.c | 27 ++++++
12 files changed, 360 insertions(+), 23 deletions(-)
create mode 100644 arch/arm/include/asm/hugetlb-2level.h
--
1.8.1.4
From: Mark Brown <broonie(a)linaro.org>
Within a DAPM sequence we normally don't care about exactly when a
register write completes, so long as the writes happen in the order we
requested. This means that we can issue most of the writes we do
asynchronously, which should maximise the ability of the underlying
frameworks to keep the hardware busy, providing a small performance
improvement on some systems.
We currently ensure that all writes are completed both when changing to a
different device and when calling into the regulator and clock frameworks.
This should ensure that the previous ordering is maintained.
We also ensure that writes are completed prior to calling into widget
event functions since some event functions implement delays. This
should be improved in future so that widgets can disable this sync in
order to add extra writes.
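The underlying regmap pattern is roughly as follows (a sketch;
REG_FOO and the value are placeholders):

/*
 * Queue writes without waiting, then sync before anything that is
 * sensitive to completion (device change, regulator/clock calls,
 * widget events).
 */
static int queue_and_sync(struct regmap *map)
{
        __be16 val = cpu_to_be16(0x1234);
        int ret;

        ret = regmap_raw_write_async(map, REG_FOO, &val, sizeof(val));
        if (ret)
                return ret;
        /* ... further writes are queued in request order ... */

        return regmap_async_complete(map);      /* wait for all to land */
}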
Signed-off-by: Mark Brown <broonie(a)linaro.org>
---
sound/soc/soc-dapm.c | 24 ++++++++++++++++++++++--
1 file changed, 22 insertions(+), 2 deletions(-)
diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c
index 25b9554..905e4c5 100644
--- a/sound/soc/soc-dapm.c
+++ b/sound/soc/soc-dapm.c
@@ -409,6 +409,12 @@ static inline void soc_widget_unlock(struct snd_soc_dapm_widget *w)
mutex_unlock(&w->platform->mutex);
}
+static void soc_dapm_async_complete(struct snd_soc_dapm_context *dapm)
+{
+ if (dapm->codec && dapm->codec->using_regmap)
+ regmap_async_complete(dapm->codec->control_data);
+}
+
static int soc_widget_update_bits_locked(struct snd_soc_dapm_widget *w,
unsigned short reg, unsigned int mask, unsigned int value)
{
@@ -417,8 +423,9 @@ static int soc_widget_update_bits_locked(struct snd_soc_dapm_widget *w,
int ret;
if (w->codec && w->codec->using_regmap) {
- ret = regmap_update_bits_check(w->codec->control_data,
- reg, mask, value, &change);
+ ret = regmap_update_bits_check_async(w->codec->control_data,
+ reg, mask, value,
+ &change);
if (ret != 0)
return ret;
} else {
@@ -1201,6 +1208,8 @@ int dapm_regulator_event(struct snd_soc_dapm_widget *w,
{
int ret;
+ soc_dapm_async_complete(w->dapm);
+
if (SND_SOC_DAPM_EVENT_ON(event)) {
if (w->on_val & SND_SOC_DAPM_REGULATOR_BYPASS) {
ret = regulator_allow_bypass(w->regulator, false);
@@ -1234,6 +1243,8 @@ int dapm_clock_event(struct snd_soc_dapm_widget *w,
if (!w->clk)
return -EIO;
+ soc_dapm_async_complete(w->dapm);
+
#ifdef CONFIG_HAVE_CLK
if (SND_SOC_DAPM_EVENT_ON(event)) {
return clk_prepare_enable(w->clk);
@@ -1426,6 +1437,7 @@ static void dapm_seq_check_event(struct snd_soc_card *card,
if (w->event && (w->event_flags & event)) {
pop_dbg(w->dapm->dev, card->pop_time, "pop test : %s %s\n",
w->name, ev_name);
+ soc_dapm_async_complete(w->dapm);
trace_snd_soc_dapm_widget_event_start(w, event);
ret = w->event(w, NULL, event);
trace_snd_soc_dapm_widget_event_done(w, event);
@@ -1498,6 +1510,7 @@ static void dapm_seq_run(struct snd_soc_card *card,
struct list_head *list, int event, bool power_up)
{
struct snd_soc_dapm_widget *w, *n;
+ struct snd_soc_dapm_context *d;
LIST_HEAD(pending);
int cur_sort = -1;
int cur_subseq = -1;
@@ -1528,6 +1541,9 @@ static void dapm_seq_run(struct snd_soc_card *card,
cur_subseq);
}
+ if (cur_dapm && w->dapm != cur_dapm)
+ soc_dapm_async_complete(cur_dapm);
+
INIT_LIST_HEAD(&pending);
cur_sort = -1;
cur_subseq = INT_MIN;
@@ -1586,6 +1602,10 @@ static void dapm_seq_run(struct snd_soc_card *card,
cur_dapm->seq_notifier(cur_dapm,
i, cur_subseq);
}
+
+ list_for_each_entry(d, &card->dapm_list, list) {
+ soc_dapm_async_complete(d);
+ }
}
static void dapm_widget_update(struct snd_soc_card *card)
--
1.8.4.rc3
On Mon, Oct 07, 2013 at 09:37:19PM -0700, Victor Kamensky wrote:
> In big endian mode mcpm_entry_point is the first function
> called on secondary CPUs. It should first switch the
> CPU into big endian mode.
>
> Signed-off-by: Victor Kamensky <victor.kamensky(a)linaro.org>
Providing Nico's also OK with it, I don't see a problem with this.
Minor cosmetic nit: please line up the ) after be with the others.
Not the end of the world, though.
Reviewed-by: Dave Martin <Dave.Martin(a)arm.com>
Cheers
---Dave
> ---
> arch/arm/common/mcpm_head.S | 2 ++
> 1 file changed, 2 insertions(+)
>
> diff --git a/arch/arm/common/mcpm_head.S b/arch/arm/common/mcpm_head.S
> index 39c96df..4f88f5e 100644
> --- a/arch/arm/common/mcpm_head.S
> +++ b/arch/arm/common/mcpm_head.S
> @@ -15,6 +15,7 @@
>
> #include <linux/linkage.h>
> #include <asm/mcpm.h>
> +#include <asm/assembler.h>
>
> #include "vlock.h"
>
> @@ -47,6 +48,7 @@
>
> ENTRY(mcpm_entry_point)
>
> + ARM_BE8(setend be)
> THUMB( adr r12, BSYM(1f) )
> THUMB( bx r12 )
> THUMB( .thumb )
> --
> 1.8.1.4
>
>
> _______________________________________________
> linaro-kernel mailing list
> linaro-kernel(a)lists.linaro.org
> http://lists.linaro.org/mailman/listinfo/linaro-kernel
From: Mark Brown <broonie(a)linaro.org>
Since it is quite common for single-register raw or async writes to
be generated by rbtree cache syncs or firmware downloads, and
essentially all hardware will be faster with only a single transfer,
optimise this case by copying single values into the internal scratch
buffer before sending.
Signed-off-by: Mark Brown <broonie(a)linaro.org>
---
drivers/base/regmap/regmap.c | 10 ++++++++++
1 file changed, 10 insertions(+)
diff --git a/drivers/base/regmap/regmap.c b/drivers/base/regmap/regmap.c
index 5754513..4866ae5 100644
--- a/drivers/base/regmap/regmap.c
+++ b/drivers/base/regmap/regmap.c
@@ -1118,6 +1118,16 @@ int _regmap_raw_write(struct regmap *map, unsigned int reg,
u8[0] |= map->write_flag_mask;
+ /*
+ * Essentially all I/O mechanisms will be faster with a single
+ * buffer to write. Since register syncs often generate raw
+ * writes of single registers optimise that case.
+ */
+ if (val != work_val && val_len == map->format.val_bytes) {
+ memcpy(work_val, val, map->format.val_bytes);
+ val = work_val;
+ }
+
if (async && map->bus->async_write) {
struct regmap_async *async;
--
1.8.4.rc3
On ARM the debug info is not present in the .eh_frame sections but
in .debug_frame instead, in DWARF format.
This patch set uses libunwind to load and parse the DWARF debug info
from the .debug_frame section if no .eh_frame_hdr section is found; it
also sets up the hooks in the perf_regs and libunwind code for ARMv7.
Dependencies:
. if present, libunwind >= 1.1 is needed to prevent a segfault when
parsing the DWARF info,
. libunwind needs to be configured with --enable-debug-frame. Note:
--enable-debug-frame is automatically selected on ARM.
The generated perf binary has been tested on ARMv7 (OMAP4, Marvell
Armada XP) and x86_64, using the following commands:
perf record -g [dwarf] -- <binary>
perf report --sort symbol --call-graph --stdio
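For reference, a minimal sketch of the libunwind remote-unwind flow
this series hooks into (public libunwind API; the accessors, the
per-sample state 'ui' and record_frame() are assumed to be provided
by the tool, as perf does):

#include <libunwind.h>

static void unwind_sample(unw_accessors_t *accessors, void *ui)
{
        unw_addr_space_t as = unw_create_addr_space(accessors, 0);
        unw_cursor_t cursor;

        if (unw_init_remote(&cursor, as, ui) == 0) {
                do {
                        unw_word_t ip;

                        unw_get_reg(&cursor, UNW_REG_IP, &ip);
                        record_frame(ip);       /* hypothetical consumer */
                } while (unw_step(&cursor) > 0);
        }

        unw_destroy_addr_space(as);
}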
Jean Pihet (2):
perf tools: Check libunwind for availability of dwarf parsing feature
perf: parse the .debug_frame section in case .eh_frame is not present
Will Deacon (2):
ARM: perf: add support for perf registers API
ARM: perf: wire up perf_regs and unwind support for ARM
arch/arm/Kconfig | 2 +
arch/arm/include/uapi/asm/Kbuild | 1 +
arch/arm/include/uapi/asm/perf_regs.h | 23 ++++++++++
arch/arm/kernel/Makefile | 1 +
arch/arm/kernel/perf_regs.c | 30 +++++++++++++
tools/perf/arch/arm/Makefile | 3 ++
tools/perf/arch/arm/include/perf_regs.h | 54 ++++++++++++++++++++++++
tools/perf/arch/arm/util/unwind.c | 48 +++++++++++++++++++++
tools/perf/config/Makefile | 13 ++++--
tools/perf/config/feature-tests.mak | 21 ++++++++-
tools/perf/util/unwind.c | 75 ++++++++++++++++++++++++++-------
11 files changed, 251 insertions(+), 20 deletions(-)
create mode 100644 arch/arm/include/uapi/asm/perf_regs.h
create mode 100644 arch/arm/kernel/perf_regs.c
create mode 100644 tools/perf/arch/arm/include/perf_regs.h
create mode 100644 tools/perf/arch/arm/util/unwind.c
--
1.7.11.7
The CCI PMU is not a CPU PMU. As such, CCI PMU events can be
initiated from any processor. Set the valid_cpus mask to indicate
this.
Signed-off-by: Punit Agrawal <punit.agrawal(a)arm.com>
---
(using the correct list-address. Please ignore the previous mail if you
get a duplicate)
Hi Tixy,
Please pick this patch - it is required for the Linaro kernels to be
able to use CCI PMU.
The patch is based on top of your integration-linux-vexpress branch.
Cheers,
Punit
drivers/bus/arm-cci.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/bus/arm-cci.c b/drivers/bus/arm-cci.c
index 57b0bc6..7363c7e 100644
--- a/drivers/bus/arm-cci.c
+++ b/drivers/bus/arm-cci.c
@@ -545,6 +545,7 @@ static int cci_pmu_init(struct arm_pmu *cci_pmu, struct platform_device *pdev)
cci_pmu->plat_device = pdev;
cci_pmu->num_events = pmu_get_max_counters();
+ cpumask_setall(&cci_pmu->valid_cpus);
return armpmu_register(cci_pmu, -1);
}
--
1.7.10.4
Hi Ben,
Here are a couple more big endian related fixes. We ran into these on
the vexpress TC2 board when the CONFIG_MCPM and CONFIG_ARM_CCI configs
are enabled: the big endian image fails to boot. With the fixes, BE TC2
boots and sees all 5 cores (2xA15, 3xA7).
These were tested with vexpress TC2 on the latest linux-linaro branch
(which includes the BE patch series), and also on a very recent
rmk/fixes branch with the BE series on top of it.
Thanks,
Victor
Victor Kamensky (2):
ARM: mcpm: fix big endian issue in mcpm startup code
ARM: cci: driver need big endian fixes in asm code
arch/arm/common/mcpm_head.S | 2 ++
drivers/bus/arm-cci.c | 6 ++++++
2 files changed, 8 insertions(+)
--
1.8.1.4
Hi Will,
Could you please review the patch following this cover letter? I ran
into this problem when the Linaro big endian topic branch received the
3.12-rc2 update and as a result picked up your
f0915781bd5edf78b1154e61efe962dc15872d09
"ARM: tlb: don't perform inner-shareable invalidation for local TLB ops"
commit. My big endian Arndale image was getting very weird user-land
crashes whenever glibc's fork was called.
It turns out that in the __flush_tlb_mm function the code passes
'ASID(mm)' directly to the tlb_op macro, unlike other functions (e.g.
local_flush_tlb_mm) where the asid is first assigned to a local
variable of 'int' type and that variable is then passed to the tlb_op
macro. Direct use of 'ASID(mm)' in the tlb_op macro does not work in
the big endian case. The resulting generated code looks like this
(from flush_tlb_mm, which uses __flush_tlb_mm):
(gdb) disassemble flush_tlb_mm
Dump of assembler code for function flush_tlb_mm:
<snip>
0x80011984 <+32>: dsb ishst
0x80011988 <+36>: movs r4, #0 <---------------------
0x8001198a <+38>: ldrb.w r5, [r2, #375] ; 0x177
0x8001198e <+42>: ldr r3, [r3, #8]
0x80011990 <+44>: tst.w r3, #65536 ; 0x10000
0x80011994 <+48>: it ne
0x80011996 <+50>: mcrne 15, 0, r5, cr8, cr7, {2}
0x8001199a <+54>: tst.w r1, #4194304 ; 0x400000
0x8001199e <+58>: it ne
0x800119a0 <+60>: mcrne 15, 0, r4, cr8, cr3, {2} <-------
0x800119a4 <+64>: dsb ish
The '15, 0, r4, cr8, cr3, {2}' instruction receives 0 through the r4
register. Note that in the LE case correct code is generated.
If I change the code to use an intermediate int variable for the asid,
as other places do, correct code is generated for __flush_tlb_mm.
My explanation is that the ASID macro actually produces a 64-bit
integer. Passing that to the inline assembler as an 'r' operand is not
really defined behaviour, and it goes wrong in the BE case. When an
intermediate 'int' variable is used, the ASID macro value is properly
converted to a 32-bit integer and the inline assembler receives an int
type, which works correctly.
Note that other possible ways to fix it are to add an 'int' cast
either outside the ASID macro call or inside it. Personally I prefer
the variant that follows this cover letter, because it is similar to
other places where this code pattern is used.
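Concretely, the v1 fix looks roughly like this (a sketch; the other
flushes in __flush_tlb_mm are elided):

static inline void __flush_tlb_mm(struct mm_struct *mm)
{
        const int asid = ASID(mm);      /* 64-bit value truncated to int */

        /* other TLB flushes elided */
        tlb_op(TLB_V7_UIS_ASID, "c8, c3, 2", asid);
}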
I've tried to understand whether the proposed fix is really a
workaround for a compiler bug. I failed to find a clear explanation of
how gcc needs to handle such a situation. Here is a quote from the gcc
manual:
> The compiler cannot
> check whether the operands have data types that are reasonable for the
> instruction being executed. It does not parse the assembler instruction
> template and does not know what it means or even whether it is valid
> assembler input. The extended `asm' feature is most often used for
> machine instructions the compiler itself does not know exist. If the
> output expression cannot be directly addressed (for example, it is a
> bit-field), your constraint must allow a register. In that case, GCC
> will use the register as the output of the `asm', and then store that
> register into the output.
It is not clear whether 'unsigned long long' is a "reasonable" type
for an arm "r" inline asm operand, so it seems to be a gray area.
Personally I suspect this issue never came up before because in many
places in tlbflush.h local 'zero' and 'asid' variables are used.
Here is a small test case that illustrates the code generation issue
and the difference between the LE and BE cases:
[kamensky@kamensky-w530 asid_inline]$ cat asid_inline.c
typedef unsigned long long u64;
struct mm_struct {
unsigned long dummy1;
u64 __attribute__((aligned(8))) counter;
};
void test1(struct mm_struct *mm)
{
const int asid = ((mm)->counter & ~((~0ULL) << 8));
do {
asm("mcr " "p15, 0, %0, " "c8, c3, 2" : : "r" (asid) : "cc");
} while (0);
}
void test2(struct mm_struct *mm)
{
do {
asm("mcr " "p15, 0, %0, " "c8, c3, 2" : : "r" (((mm)->counter & ~((~0ULL) << 8))) : "cc");
} while (0);
}
void test3(struct mm_struct *mm)
{
do {
asm("mcr " "p15, 0, %0, " "c8, c3, 2" : : "r" ((int)((mm)->counter & ~((~0ULL) << 8))) : "cc");
} while (0);
}
[kamensky@kamensky-w530 asid_inline]$ ./asid_inline.sh
+ arm-linux-gnueabihf-gcc -nostdinc -mbig-endian -O2 -mabi=aapcs-linux -mno-thumb-interwork -mthumb -march=armv7-a -msoft-float -c -o asid_inline.be.o asid_inline.c
+ arm-linux-gnueabihf-objdump --disassemble --reloc asid_inline.be.o
asid_inline.be.o: file format elf32-bigarm
Disassembly of section .text:
00000000 <test1>:
0: 7bc3 ldrb r3, [r0, #15]
2: ee08 3f53 mcr 15, 0, r3, cr8, cr3, {2}
6: 4770 bx lr
00000008 <test2>:
8: 7bc3 ldrb r3, [r0, #15]
a: 2200 movs r2, #0
c: ee08 2f53 mcr 15, 0, r2, cr8, cr3, {2}
10: 4770 bx lr
12: bf00 nop
00000014 <test3>:
14: 7bc3 ldrb r3, [r0, #15]
16: ee08 3f53 mcr 15, 0, r3, cr8, cr3, {2}
1a: 4770 bx lr
+ arm-linux-gnueabihf-gcc -nostdinc -mlittle-endian -O2 -mabi=aapcs-linux -mno-thumb-interwork -mthumb -march=armv7-a -msoft-float -c -o asid_inline.le.o asid_inline.c
+ arm-linux-gnueabihf-objdump --disassemble --reloc asid_inline.le.o
asid_inline.le.o: file format elf32-littlearm
Disassembly of section .text:
00000000 <test1>:
0: 7a03 ldrb r3, [r0, #8]
2: ee08 3f53 mcr 15, 0, r3, cr8, cr3, {2}
6: 4770 bx lr
00000008 <test2>:
8: 7a02 ldrb r2, [r0, #8]
a: 2300 movs r3, #0
c: ee08 2f53 mcr 15, 0, r2, cr8, cr3, {2}
10: 4770 bx lr
12: bf00 nop
00000014 <test3>:
14: 7a03 ldrb r3, [r0, #8]
16: ee08 3f53 mcr 15, 0, r3, cr8, cr3, {2}
1a: 4770 bx lr
[kamensky@kamensky-w530 asid_inline]$
Thanks,
Victor
Victor Kamensky (1):
ARM: tlb: __flush_tlb_mm need to use int asid var for BE correct
operation
arch/arm/include/asm/tlbflush.h | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
--
1.8.1.4
This patchset adds support for basic kernel probes (kprobes), jump
probes (jprobes) and return probes (kretprobes) for AArch64.
This kprobes mechanism makes use of the software breakpoint and single
stepping support available in the ARM v8 kernel.
Basic verification is done with the sample test modules available as
part of "samples/kprobes/", running on the ARM v8 fast model (RTSM).
Patch 1 (AArch64-Add-single-step-and-breakpoint-handler-hooks.patch) is the v3 version of:
http://permalink.gmane.org/gmane.linux.ports.arm.kernel/269733
Changes:
v2 -> v3
- Renamed break_lock to break_hook_lock
- Use RCU-protected list traversal for step_hook
- Eliminated the addr argument for the debug hooks; callback functions
  now extract the address from pt_regs instead (see the sketch after
  this list)
- Refined the entry.S changes to handle only the 'BRK64' ESR value
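A rough sketch of the resulting hook interface (field and constant
names are assumptions based on the description, not the actual patch;
kprobe_breakpoint_handler() is a hypothetical handler):

static int kprobe_brk_fn(struct pt_regs *regs, unsigned int esr)
{
        /* the breakpoint address is taken from regs->pc, not passed
         * as a separate argument */
        if (!kprobe_breakpoint_handler(regs))
                return DBG_HOOK_ERROR;
        return DBG_HOOK_HANDLED;
}

static struct break_hook kprobes_break_hook = {
        .esr_mask = BRK64_ESR_MASK,     /* assumed constant names */
        .esr_val  = BRK64_ESR_KPROBES,
        .fn       = kprobe_brk_fn,
};

static int __init kprobes_hook_init(void)
{
        register_break_hook(&kprobes_break_hook);
        return 0;
}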
Patch 2 (arm64-Kernel-code-patching-support.patch) implements the basic code patching support needed for kprobes.
A similar API was published earlier on LKML/LAKML as part of jump label support: https://lkml.org/lkml/2013/9/25/250
However, kprobes needs some changes relative to that version; this can be rebased on the new version of the patch from Jiang.
Sandeepa Prabhu (5):
AArch64: Add single-step and breakpoint handler hooks
arm64: Kernel code patching support
AArch64: Instruction simulation and decode support
AArch64: Add Kprobes support for ARM v8 kernel
AArch64: Support kretprobe support for ARM v8
arch/arm64/Kconfig | 2 +
arch/arm64/include/asm/debug-monitors.h | 23 ++
arch/arm64/include/asm/kprobes.h | 58 +++
arch/arm64/include/asm/probes.h | 48 +++
arch/arm64/include/asm/ptrace.h | 6 +
arch/arm64/kernel/Makefile | 2 +
arch/arm64/kernel/debug-monitors.c | 85 ++++-
arch/arm64/kernel/entry.S | 2 +
arch/arm64/kernel/kprobes-arm64.c | 245 ++++++++++++
arch/arm64/kernel/kprobes-arm64.h | 26 ++
arch/arm64/kernel/kprobes.c | 642 ++++++++++++++++++++++++++++++++
arch/arm64/kernel/kprobes.h | 28 ++
arch/arm64/kernel/patch.c | 58 +++
arch/arm64/kernel/patch.h | 20 +
arch/arm64/kernel/probes-aarch64.c | 235 ++++++++++++
arch/arm64/kernel/probes-aarch64.h | 127 +++++++
arch/arm64/kernel/probes-common.c | 117 ++++++
arch/arm64/kernel/vmlinux.lds.S | 1 +
18 files changed, 1722 insertions(+), 3 deletions(-)
create mode 100644 arch/arm64/include/asm/kprobes.h
create mode 100644 arch/arm64/include/asm/probes.h
create mode 100644 arch/arm64/kernel/kprobes-arm64.c
create mode 100644 arch/arm64/kernel/kprobes-arm64.h
create mode 100644 arch/arm64/kernel/kprobes.c
create mode 100644 arch/arm64/kernel/kprobes.h
create mode 100644 arch/arm64/kernel/patch.c
create mode 100644 arch/arm64/kernel/patch.h
create mode 100644 arch/arm64/kernel/probes-aarch64.c
create mode 100644 arch/arm64/kernel/probes-aarch64.h
create mode 100644 arch/arm64/kernel/probes-common.c
--
1.8.1.2
The sleep_length is computed in the tick_nohz_stop_sched_tick function,
but it is used later in the code, with the local irqs enabled in between:
cpu_idle_loop
    tick_nohz_idle_enter [ exits with local irq enabled ]
        __tick_nohz_idle_enter
            tick_nohz_stop_sched_tick
    ...
    arch_cpu_idle
        menu_select [ uses 'sleep_length' here ]
    ...
Between the computation of the sleep length and its usage, interrupts
may occur, making the sleep length shorter than it actually is because
of the interrupt processing, or different if the timer itself expires.
This patch fixes that by moving the sleep_length computation into the
tick_nohz_get_sleep_length function, using the tick device's next_event.
As the sleep_length field is no longer needed, it is removed from the
tick_sched structure.
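For context, the main consumer is the cpuidle menu governor, roughly
along these lines (a sketch; the exact conversion in menu.c may
differ):

        /*
         * Called with irqs enabled, so with this patch the value
         * reflects the tick device's state at call time rather than
         * a value cached before idle entry.
         */
        ktime_t sleep = tick_nohz_get_sleep_length();
        unsigned int expected_us = DIV_ROUND_UP(ktime_to_ns(sleep),
                                                NSEC_PER_USEC);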
Signed-off-by: Daniel Lezcano <daniel.lezcano(a)linaro.org>
Signed-off-by: Stephen Boyd <sboyd(a)codeaurora.org>
---
include/linux/tick.h | 2 --
kernel/time/tick-sched.c | 5 +++--
2 files changed, 3 insertions(+), 4 deletions(-)
diff --git a/include/linux/tick.h b/include/linux/tick.h
index 5128d33..53dbbd7 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -48,7 +48,6 @@ enum tick_nohz_mode {
* @idle_exittime: Time when the idle state was left
* @idle_sleeptime: Sum of the time slept in idle with sched tick stopped
* @iowait_sleeptime: Sum of the time slept in idle with sched tick stopped, with IO outstanding
- * @sleep_length: Duration of the current idle sleep
* @do_timer_lst: CPU was the last one doing do_timer before going idle
*/
struct tick_sched {
@@ -67,7 +66,6 @@ struct tick_sched {
ktime_t idle_exittime;
ktime_t idle_sleeptime;
ktime_t iowait_sleeptime;
- ktime_t sleep_length;
unsigned long last_jiffies;
unsigned long next_jiffies;
ktime_t idle_expires;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 3612fc7..60b1dcd 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -673,7 +673,6 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
out:
ts->next_jiffies = next_jiffies;
ts->last_jiffies = last_jiffies;
- ts->sleep_length = ktime_sub(dev->next_event, now);
return ret;
}
@@ -837,8 +836,10 @@ void tick_nohz_irq_exit(void)
ktime_t tick_nohz_get_sleep_length(void)
{
struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
+ struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
+ ktime_t now = ktime_get();
- return ts->sleep_length;
+ return ktime_sub(dev->next_event, now);
}
static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
--
1.7.9.5