From bcc2c9c3fff859e0eb019fe6fec26f9b8eba795c Mon Sep 17 00:00:00 2001
From: Olaf Hering <olaf@aepfle.de>
Date: Thu, 31 May 2012 16:40:06 +0200
Subject: Tools: hv: verify origin of netlink connector message

The SuSE security team suggested to use recvfrom instead of recv to be
certain that the connector message is originated from kernel.

CVE-2012-2669

Signed-off-by: Olaf Hering <olaf@aepfle.de>
Signed-off-by: Marcus Meissner <meissner@suse.de>
Signed-off-by: Sebastian Krahmer <krahmer@suse.de>
Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>
Cc: stable <stable@vger.kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 tools/hv/hv_kvp_daemon.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/hv/hv_kvp_daemon.c b/tools/hv/hv_kvp_daemon.c
index 146fd6147e84..d9834b362943 100644
--- a/tools/hv/hv_kvp_daemon.c
+++ b/tools/hv/hv_kvp_daemon.c
@@ -701,14 +701,18 @@ int main(void)
 	pfd.fd = fd;
 
 	while (1) {
+		struct sockaddr *addr_p = (struct sockaddr *) &addr;
+		socklen_t addr_l = sizeof(addr);
 		pfd.events = POLLIN;
 		pfd.revents = 0;
 		poll(&pfd, 1, -1);
 
-		len = recv(fd, kvp_recv_buffer, sizeof(kvp_recv_buffer), 0);
+		len = recvfrom(fd, kvp_recv_buffer, sizeof(kvp_recv_buffer), 0,
+				addr_p, &addr_l);
 
-		if (len < 0) {
-			syslog(LOG_ERR, "recv failed; error:%d", len);
+		if (len < 0 || addr.nl_pid) {
+			syslog(LOG_ERR, "recvfrom failed; pid:%u error:%d %s",
+					addr.nl_pid, errno, strerror(errno));
 			close(fd);
 			return -1;
 		}
-- 
cgit v1.2.3


From 80c0120a3cca30166c0ab8b24e44be67e97b79af Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Fri, 8 Jun 2012 11:47:51 -0300
Subject: perf tools: Fix endianity swapping for adds_features bitmask

Based on Jiri's latest attempt:
https://lkml.org/lkml/2012/5/16/61

Basically, adds_features should be byte swapped assuming unsigned
longs are either 8-bytes (u64) or 4-bytes (u32).

    Fixes 32-bit ppc dumping 64-bit x86 feature data:
     ========
     captured on: Sun May 20 19:23:23 2012
     hostname : nxos-vdc-dev3
     os release : 3.4.0-rc7+
     perf version : 3.4.rc4.137.g978da3
     arch : x86_64
     nrcpus online : 16
     nrcpus avail : 16
     cpudesc : Intel(R) Xeon(R) CPU E5540 @ 2.53GHz
     cpuid : GenuineIntel,6,26,5
     total memory : 24680324 kB
    ...

Verified 64-bit x86 can still dump feature data for 32-bit ppc.

Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Jiri Olsa <jolsa@redhat.com>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/4FBBB539.5010805@gmail.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/header.c               | 16 +++++++++-------
 tools/perf/util/include/linux/bitops.h |  2 ++
 tools/perf/util/session.c              | 10 ++++++++++
 tools/perf/util/session.h              |  1 +
 4 files changed, 22 insertions(+), 7 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 2dd5edf161b7..4f9b247fb312 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -1942,7 +1942,6 @@ int perf_file_header__read(struct perf_file_header *header,
 		else
 			return -1;
 	} else if (ph->needs_swap) {
-		unsigned int i;
 		/*
 		 * feature bitmap is declared as an array of unsigned longs --
 		 * not good since its size can differ between the host that
@@ -1958,14 +1957,17 @@ int perf_file_header__read(struct perf_file_header *header,
 		 * file), punt and fallback to the original behavior --
 		 * clearing all feature bits and setting buildid.
 		 */
-		for (i = 0; i < BITS_TO_LONGS(HEADER_FEAT_BITS); ++i)
-			header->adds_features[i] = bswap_64(header->adds_features[i]);
+		mem_bswap_64(&header->adds_features,
+			    BITS_TO_U64(HEADER_FEAT_BITS));
 
 		if (!test_bit(HEADER_HOSTNAME, header->adds_features)) {
-			for (i = 0; i < BITS_TO_LONGS(HEADER_FEAT_BITS); ++i) {
-				header->adds_features[i] = bswap_64(header->adds_features[i]);
-				header->adds_features[i] = bswap_32(header->adds_features[i]);
-			}
+			/* unswap as u64 */
+			mem_bswap_64(&header->adds_features,
+				    BITS_TO_U64(HEADER_FEAT_BITS));
+
+			/* unswap as u32 */
+			mem_bswap_32(&header->adds_features,
+				    BITS_TO_U32(HEADER_FEAT_BITS));
 		}
 
 		if (!test_bit(HEADER_HOSTNAME, header->adds_features)) {
diff --git a/tools/perf/util/include/linux/bitops.h b/tools/perf/util/include/linux/bitops.h
index f1584833bd22..587a230d2075 100644
--- a/tools/perf/util/include/linux/bitops.h
+++ b/tools/perf/util/include/linux/bitops.h
@@ -8,6 +8,8 @@
 #define BITS_PER_LONG __WORDSIZE
 #define BITS_PER_BYTE           8
 #define BITS_TO_LONGS(nr)       DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(long))
+#define BITS_TO_U64(nr)         DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(u64))
+#define BITS_TO_U32(nr)         DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(u32))
 
 #define for_each_set_bit(bit, addr, size) \
 	for ((bit) = find_first_bit((addr), (size));		\
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 2600916efa83..c3e399bcf18d 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -442,6 +442,16 @@ static void perf_tool__fill_defaults(struct perf_tool *tool)
 			tool->finished_round = process_finished_round_stub;
 	}
 }
+ 
+void mem_bswap_32(void *src, int byte_size)
+{
+	u32 *m = src;
+	while (byte_size > 0) {
+		*m = bswap_32(*m);
+		byte_size -= sizeof(u32);
+		++m;
+	}
+}
 
 void mem_bswap_64(void *src, int byte_size)
 {
diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h
index 7a5434c00565..0c702e3f0a36 100644
--- a/tools/perf/util/session.h
+++ b/tools/perf/util/session.h
@@ -80,6 +80,7 @@ struct branch_info *machine__resolve_bstack(struct machine *self,
 bool perf_session__has_traces(struct perf_session *self, const char *msg);
 
 void mem_bswap_64(void *src, int byte_size);
+void mem_bswap_32(void *src, int byte_size);
 void perf_event__attr_swap(struct perf_event_attr *attr);
 
 int perf_session__create_kernel_maps(struct perf_session *self);
-- 
cgit v1.2.3


From fc3e4d077d5c7a7bc1ad5bc143895b4e070e5a8b Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@google.com>
Date: Tue, 15 May 2012 13:11:11 +0200
Subject: perf stat: Fix default output file

The following commit:

commit 56f3bae70638b33477a6015fd362ccfe354fd3ee
Author: Jim Cromie <jim.cromie@gmail.com>
Date:   Wed Sep 7 17:14:00 2011 -0600

    perf stat: Add --log-fd <N> option to redirect stderr elsewhere

introduced a bug in the way perf stat outputs the results by default,
i.e., without the --log-fd or --output option. It would default to
writing to file descriptor 0, i.e., stdin. Writing to stdin is allowed
and is equivalent to writing to stdout. However, there is a major
difference for any script that was already capturing the output of perf
stat via redirection:

    perf stat >/tmp/log .... or perf stat 2>/tmp/log ....

They would not capture anything anymore. They would have to do:
    perf stat 0>/tmp/log ...

This breaks compatibility with existing scripts and does not look very
natural.

This patch fixes the problem by looking at output_fd only when it was
modified by user (> 0). It also checks that the value if positive.
Passing --log-fd 0 is ignored.

I would also argue that defaulting to stderr for the results is not the
right thing to do, though this patch does not address this specific
issue.

Signed-off-by: Stephane Eranian <eranian@google.com>
Cc: David Ahern <dsahern@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Jim Cromie <jim.cromie@gmail.com>
Link: http://lkml.kernel.org/r/20120515111111.GA9870@quad
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-stat.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 262589991ea4..07b5c7703dd1 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -1179,6 +1179,12 @@ int cmd_stat(int argc, const char **argv, const char *prefix __used)
 		fprintf(stderr, "cannot use both --output and --log-fd\n");
 		usage_with_options(stat_usage, options);
 	}
+
+	if (output_fd < 0) {
+		fprintf(stderr, "argument to --log-fd must be a > 0\n");
+		usage_with_options(stat_usage, options);
+	}
+
 	if (!output) {
 		struct timespec tm;
 		mode = append_file ? "a" : "w";
@@ -1190,7 +1196,7 @@ int cmd_stat(int argc, const char **argv, const char *prefix __used)
 		}
 		clock_gettime(CLOCK_REALTIME, &tm);
 		fprintf(output, "# started on %s\n", ctime(&tm.tv_sec));
-	} else if (output_fd != 2) {
+	} else if (output_fd > 0) {
 		mode = append_file ? "a" : "w";
 		output = fdopen(output_fd, mode);
 		if (!output) {
-- 
cgit v1.2.3


From cb9dd49e11f83d548c822d7022ac180b0518b25c Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 11 Jun 2012 19:03:32 -0300
Subject: perf tools: Fix synthesizing tracepoint names from the perf.data
 headers

We need to use the per event info snapshoted at record time to
synthesize the events name, so do it just after reading the perf.data
headers, when we already processed the /sys events data, otherwise we'll
end up using the local /sys that only by sheer luck will have the same
tracepoint ID -> real event association.

Example:

  # uname -a
  Linux felicio.ghostprotocols.net 3.4.0-rc5+ #1 SMP Sat May 19 15:27:11 BRT 2012 x86_64 x86_64 x86_64 GNU/Linux
  # perf record -e sched:sched_switch usleep 1
  [ perf record: Woken up 1 times to write data ]
  [ perf record: Captured and wrote 0.015 MB perf.data (~648 samples) ]
  # cat /t/events/sched/sched_switch/id
  279
  # perf evlist -v
  sched:sched_switch: sample_freq=1, type: 2, config: 279, size: 80, sample_type: 1159, read_format: 7, disabled: 1, inherit: 1, mmap: 1, comm: 1, enable_on_exec: 1, sample_id_all: 1, exclude_guest: 1
  #

So on the above machine the sched:sched_switch has tracepoint id 279, but on
the machine were we'll analyse it it has a different id:

  $ cat /t/events/sched/sched_switch/id
  56
  $ perf evlist -i /tmp/perf.data
  kmem:mm_balancedirty_writeout
  $ cat /t/events/kmem/mm_balancedirty_writeout/id
  279

With this fix:

  $ perf evlist -i /tmp/perf.data
  sched:sched_switch

Reported-by: Dmitry Antipov <dmitry.antipov@linaro.org>
Cc: David Ahern <dsahern@gmail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Namhyung Kim <namhyung@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Link: http://lkml.kernel.org/n/tip-auwks8fpuhmrdpiefs55o5oz@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/header.c | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

(limited to 'tools')

diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 4f9b247fb312..e909d43cf542 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -2093,6 +2093,35 @@ static int read_attr(int fd, struct perf_header *ph,
 	return ret <= 0 ? -1 : 0;
 }
 
+static int perf_evsel__set_tracepoint_name(struct perf_evsel *evsel)
+{
+	struct event_format *event = trace_find_event(evsel->attr.config);
+	char bf[128];
+
+	if (event == NULL)
+		return -1;
+
+	snprintf(bf, sizeof(bf), "%s:%s", event->system, event->name);
+	evsel->name = strdup(bf);
+	if (event->name == NULL)
+		return -1;
+
+	return 0;
+}
+
+static int perf_evlist__set_tracepoint_names(struct perf_evlist *evlist)
+{
+	struct perf_evsel *pos;
+
+	list_for_each_entry(pos, &evlist->entries, node) {
+		if (pos->attr.type == PERF_TYPE_TRACEPOINT &&
+		    perf_evsel__set_tracepoint_name(pos))
+			return -1;
+	}
+
+	return 0;
+}
+
 int perf_session__read_header(struct perf_session *session, int fd)
 {
 	struct perf_header *header = &session->header;
@@ -2174,6 +2203,9 @@ int perf_session__read_header(struct perf_session *session, int fd)
 
 	lseek(fd, header->data_offset, SEEK_SET);
 
+	if (perf_evlist__set_tracepoint_names(session->evlist))
+		goto out_delete_evlist;
+
 	header->frozen = 1;
 	return 0;
 out_errno:
-- 
cgit v1.2.3


From 76a8349dfdb775d387e9767db3092e410403138a Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Thu, 14 Jun 2012 12:36:17 -0600
Subject: perf script: Fix format regression due to libtraceevent merge

Consider the commands:
    perf record -e sched:sched_switch -fo /tmp/perf.data  -a -- sleep 1
    perf script -i /tmp/perf.data

In v3.4 the output has the form (lines wrapped here)
    perf 29214 [005] 821043.582596: sched_switch:
prev_comm=perf prev_pid=29214 prev_prio=120
prev_state=S ==> next_comm=swapper/5 next_pid=0 next_prio=120

In 3.5 that same line has become:
    perf 29214 [005] 821043.582596: sched_switch:
<...>-29214 [005]     0.000000000: sched_switch:
prev_comm=perf prev_pid=29214 prev_prio=120
prev_state=S ==> next_comm=swapper/5 next_pid=0 next_prio=120

Note the duplicates in the output -- pid, cpu, event name. With
this patch the v3.4 output is restored:
    perf 29214 [005] 821043.582596: sched_switch:
prev_comm=perf prev_pid=29214 prev_prio=120
prev_state=S ==> next_comm=swapper/5 next_pid=0 next_prio=120

v3:
Remove that pesky newline too. Output now matches v3.4 (pre-libtracevent).

v2:
Change print_trace_event function local to perf per Steve's comments.

Signed-off-by: David Ahern <dsahern@gmail.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Namhyung Kim <namhyung@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Link: http://lkml.kernel.org/r/1339698977-68962-1-git-send-email-dsahern@gmail.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/trace-event-parse.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/trace-event-parse.c b/tools/perf/util/trace-event-parse.c
index df2fddbf0cd2..5dd3b5ec8411 100644
--- a/tools/perf/util/trace-event-parse.c
+++ b/tools/perf/util/trace-event-parse.c
@@ -198,9 +198,8 @@ void print_trace_event(int cpu, void *data, int size)
 	record.data = data;
 
 	trace_seq_init(&s);
-	pevent_print_event(pevent, &s, &record);
+	pevent_event_info(&s, event, &record);
 	trace_seq_do_printf(&s);
-	printf("\n");
 }
 
 void print_event(int cpu, void *data, int size, unsigned long long nsecs,
-- 
cgit v1.2.3


From 207b5792696206663a38e525b9793644895bad3b Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Sun, 1 Jul 2012 16:11:37 -0600
Subject: perf kvm: Fix regression with guest machine creation

Commit 743eb868657bdb1b26c7b24077ca21c67c82c777 reworked when the
machines were created. Prior to this commit guest machines could be
created in perf_event__process_kernel_mmap() while processing kernel
MMAP events. This commit assumes that the machines exist by the time
perf_session_deliver_event is called (e.g., during processing of build
id events) - which is not always correct.

One example is the use of default guest args (--guestkallsyms and
--guestmodules) for short times where no samples hit within a guest
module. For this case no build id is added to the file header. No build
id == no machine created. That leads to the next example -- the use of
no-buildid (-B) on the record for all perf-kvm invocations. In both
cases perf report dies with a SEGFAULT of the form:

(gdb) bt
0  0x000000000046dd7b in machine__mmap_name (self=0x0, bf=0x7fffffffbd20 "q\021", size=4096) at util/map.c:715
1  0x0000000000444161 in perf_event__process_kernel_mmap (tool=0x7fffffffdd80, event=0x7ffff7fb4120, machine=0x0) at util/event.c:562
2  0x0000000000444642 in perf_event__process_mmap (tool=0x7fffffffdd80, event=0x7ffff7fb4120, sample=0x7fffffffd210, machine=0x0)
    at util/event.c:668
3  0x0000000000470e0b in perf_session_deliver_event (session=0x915ca0, event=0x7ffff7fb4120, sample=0x7fffffffd210, tool=0x7fffffffdd80,
    file_offset=8480) at util/session.c:979
4  0x000000000047032e in flush_sample_queue (s=0x915ca0, tool=0x7fffffffdd80) at util/session.c:679
5  0x0000000000471c8d in __perf_session__process_events (session=0x915ca0, data_offset=400, data_size=150448, file_size=150848, tool=
    0x7fffffffdd80) at util/session.c:1363
6  0x0000000000471d42 in perf_session__process_events (self=0x915ca0, tool=0x7fffffffdd80) at util/session.c:1379
7  0x000000000042484a in __cmd_report (rep=0x7fffffffdd80) at builtin-report.c:368
8  0x0000000000425bf1 in cmd_report (argc=0, argv=0x915b00, prefix=0x0) at builtin-report.c:756
9  0x0000000000438505 in __cmd_report (argc=4, argv=0x7fffffffe260) at builtin-kvm.c:84
10 0x000000000043882a in cmd_kvm (argc=4, argv=0x7fffffffe260, prefix=0x0) at builtin-kvm.c:131
11 0x00000000004152cd in run_builtin (p=0x7a54e8, argc=9, argv=0x7fffffffe260) at perf.c:273
12 0x00000000004154c7 in handle_internal_command (argc=9, argv=0x7fffffffe260) at perf.c:345
13 0x0000000000415613 in run_argv (argcp=0x7fffffffe14c, argv=0x7fffffffe140) at perf.c:389
14 0x0000000000415899 in main (argc=9, argv=0x7fffffffe260) at perf.c:487

Fix by allowing the machine to be created in perf_session_deliver_event.

Tested with --guestmount option and default guest args, with and without
-B arg on record for both and for short (10 seconds) and long (10
minutes) windows.

Reported-by: Pradeep Kumar Surisetty <psuriset@linux.vnet.ibm.com>
Signed-off-by: David Ahern <dsahern@gmail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Namhyung Kim <namhyung@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Pradeep Kumar Surisetty <psuriset@linux.vnet.ibm.com>
Link: http://lkml.kernel.org/r/1341180697-64515-1-git-send-email-dsahern@gmail.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/session.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index c3e399bcf18d..56142d0fb8d7 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -926,7 +926,7 @@ static struct machine *
 		else
 			pid = event->ip.pid;
 
-		return perf_session__find_machine(session, pid);
+		return perf_session__findnew_machine(session, pid);
 	}
 
 	return perf_session__find_host_machine(session);
-- 
cgit v1.2.3


From 7ed97ad41ffa94040dfd593948962a7e9e7b0db9 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Mon, 2 Jul 2012 09:12:57 -0600
Subject: perf kvm: Fix segfault with report and mixed guestmount use

Using the guestmount option on record:
    $ perf kvm --guest --host --guestmount=/tmp/guest-mount record -ag

But not the subsequent report:
    $ perf kvm report

causes a SEGFAULT in the usual place:
(gdb) bt
0  0x0000000000470356 in machine__mmap_name (self=0x0, bf=0x7fffffffbdb0 " z\370\367\377\177", size=
    4096) at util/map.c:712
1  0x00000000004453e8 in perf_event__process_kernel_mmap (tool=0x7fffffffde10, event=0x7ffff7f87e38,
    machine=0x0) at util/event.c:550
2  0x00000000004458c9 in perf_event__process_mmap (tool=0x7fffffffde10, event=0x7ffff7f87e38, sample=
    0x7fffffffd2a0, machine=0x0) at util/event.c:656
3  0x00000000004733e0 in perf_session_deliver_event (session=0x91aca0, event=0x7ffff7f87e38, sample=
    0x7fffffffd2a0, tool=0x7fffffffde10, file_offset=7736) at util/session.c:979
...

The MMAP events in this case already contain the full path to the
module. No need to require it for the report path to.

Signed-off-by: David Ahern <dsahern@gmail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Namhyung Kim <namhyung@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1341241977-71535-1-git-send-email-dsahern@gmail.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/map.c | 29 +++++++++++++++--------------
 1 file changed, 15 insertions(+), 14 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c
index 35ae56864e4f..a1f4e3669142 100644
--- a/tools/perf/util/map.c
+++ b/tools/perf/util/map.c
@@ -669,25 +669,26 @@ struct machine *machines__find(struct rb_root *self, pid_t pid)
 struct machine *machines__findnew(struct rb_root *self, pid_t pid)
 {
 	char path[PATH_MAX];
-	const char *root_dir;
+	const char *root_dir = "";
 	struct machine *machine = machines__find(self, pid);
 
-	if (!machine || machine->pid != pid) {
-		if (pid == HOST_KERNEL_ID || pid == DEFAULT_GUEST_KERNEL_ID)
-			root_dir = "";
-		else {
-			if (!symbol_conf.guestmount)
-				goto out;
-			sprintf(path, "%s/%d", symbol_conf.guestmount, pid);
-			if (access(path, R_OK)) {
-				pr_err("Can't access file %s\n", path);
-				goto out;
-			}
-			root_dir = path;
+	if (machine && (machine->pid == pid))
+		goto out;
+
+	if ((pid != HOST_KERNEL_ID) &&
+	    (pid != DEFAULT_GUEST_KERNEL_ID) &&
+	    (symbol_conf.guestmount)) {
+		sprintf(path, "%s/%d", symbol_conf.guestmount, pid);
+		if (access(path, R_OK)) {
+			pr_err("Can't access file %s\n", path);
+			machine = NULL;
+			goto out;
 		}
-		machine = machines__add(self, pid, root_dir);
+		root_dir = path;
 	}
 
+	machine = machines__add(self, pid, root_dir);
+
 out:
 	return machine;
 }
-- 
cgit v1.2.3


From c98d5d9444732a032bc55d1a496bfa8439da9199 Mon Sep 17 00:00:00 2001
From: Len Brown <len.brown@intel.com>
Date: Mon, 4 Jun 2012 00:56:40 -0400
Subject: tools/power: turbostat v2 - re-write for efficiency

Measuring large profoundly-idle configurations
requires turbostat to be more lightweight.
Otherwise, the operation of turbostat itself
can interfere with the measurements.

This re-write makes turbostat topology aware.
Hardware is accessed in "topology order".
Redundant hardware accesses are deleted.
Redundant output is deleted.
Also, output is buffered and
local RDTSC use replaces remote MSR access for TSC.

From a feature point of view, the output
looks different since redundant figures are absent.
Also, there are now -c and -p options -- to restrict
output to the 1st thread in each core, and the 1st
thread in each package, respectively.  This is helpful
to reduce output on big systems, where more detail
than the "-s" system summary is desired.
Finally, periodic mode output is now on stdout, not stderr.

Turbostat v2 is also slightly more robust in
handling run-time CPU online/offline events,
as it now checks the actual map of on-line cpus rather
than just the total number of on-line cpus.

Signed-off-by: Len Brown <len.brown@intel.com>
---
 tools/power/x86/turbostat/Makefile    |    1 +
 tools/power/x86/turbostat/turbostat.8 |   77 +-
 tools/power/x86/turbostat/turbostat.c | 1329 ++++++++++++++++++++-------------
 3 files changed, 868 insertions(+), 539 deletions(-)

(limited to 'tools')

diff --git a/tools/power/x86/turbostat/Makefile b/tools/power/x86/turbostat/Makefile
index fd8e1f1297aa..f85649554191 100644
--- a/tools/power/x86/turbostat/Makefile
+++ b/tools/power/x86/turbostat/Makefile
@@ -1,4 +1,5 @@
 turbostat : turbostat.c
+CFLAGS +=	-Wall
 
 clean :
 	rm -f turbostat
diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8
index adf175f61496..74e44507dfe9 100644
--- a/tools/power/x86/turbostat/turbostat.8
+++ b/tools/power/x86/turbostat/turbostat.8
@@ -27,7 +27,11 @@ supports an "invariant" TSC, plus the APERF and MPERF MSRs.
 on processors that additionally support C-state residency counters.
 
 .SS Options
-The \fB-s\fP option prints only a 1-line summary for each sample interval.
+The \fB-s\fP option limits output to a 1-line system summary for each interval.
+.PP
+The \fB-c\fP option limits output to the 1st thread in each core.
+.PP
+The \fB-p\fP option limits output to the 1st thread in each package.
 .PP
 The \fB-v\fP option increases verbosity.
 .PP
@@ -65,19 +69,19 @@ Subsequent rows show per-CPU statistics.
 .nf
 [root@x980]# ./turbostat
 cor CPU    %c0  GHz  TSC    %c1    %c3    %c6   %pc3   %pc6
-          0.60 1.63 3.38   2.91   0.00  96.49   0.00  76.64
-  0   0   0.59 1.62 3.38   4.51   0.00  94.90   0.00  76.64
-  0   6   1.13 1.64 3.38   3.97   0.00  94.90   0.00  76.64
-  1   2   0.08 1.62 3.38   0.07   0.00  99.85   0.00  76.64
-  1   8   0.03 1.62 3.38   0.12   0.00  99.85   0.00  76.64
-  2   4   0.01 1.62 3.38   0.06   0.00  99.93   0.00  76.64
-  2  10   0.04 1.62 3.38   0.02   0.00  99.93   0.00  76.64
-  8   1   2.85 1.62 3.38  11.71   0.00  85.44   0.00  76.64
-  8   7   1.98 1.62 3.38  12.58   0.00  85.44   0.00  76.64
-  9   3   0.36 1.62 3.38   0.71   0.00  98.93   0.00  76.64
-  9   9   0.09 1.62 3.38   0.98   0.00  98.93   0.00  76.64
- 10   5   0.03 1.62 3.38   0.09   0.00  99.87   0.00  76.64
- 10  11   0.07 1.62 3.38   0.06   0.00  99.87   0.00  76.64
+          0.09 1.62 3.38   1.83   0.32  97.76   1.26  83.61
+  0   0   0.15 1.62 3.38  10.23   0.05  89.56   1.26  83.61
+  0   6   0.05 1.62 3.38  10.34
+  1   2   0.03 1.62 3.38   0.07   0.05  99.86
+  1   8   0.03 1.62 3.38   0.06
+  2   4   0.21 1.62 3.38   0.10   1.49  98.21
+  2  10   0.02 1.62 3.38   0.29
+  8   1   0.04 1.62 3.38   0.04   0.08  99.84
+  8   7   0.01 1.62 3.38   0.06
+  9   3   0.53 1.62 3.38   0.10   0.20  99.17
+  9   9   0.02 1.62 3.38   0.60
+ 10   5   0.01 1.62 3.38   0.02   0.04  99.92
+ 10  11   0.02 1.62 3.38   0.02
 .fi
 .SH SUMMARY EXAMPLE
 The "-s" option prints the column headers just once,
@@ -86,9 +90,10 @@ and then the one line system summary for each sample interval.
 .nf
 [root@x980]# ./turbostat -s
    %c0  GHz  TSC    %c1    %c3    %c6   %pc3   %pc6
-  0.61 1.89 3.38   5.95   0.00  93.44   0.00  66.33
-  0.52 1.62 3.38   6.83   0.00  92.65   0.00  61.11
-  0.62 1.92 3.38   5.47   0.00  93.91   0.00  67.31
+  0.23 1.67 3.38   2.00   0.30  97.47   1.07  82.12
+  0.10 1.62 3.38   1.87   2.25  95.77  12.02  72.60
+  0.20 1.64 3.38   1.98   0.11  97.72   0.30  83.36
+  0.11 1.70 3.38   1.86   1.81  96.22   9.71  74.90
 .fi
 .SH VERBOSE EXAMPLE
 The "-v" option adds verbosity to the output:
@@ -120,30 +125,28 @@ until ^C while the other CPUs are mostly idle:
 [root@x980 lenb]# ./turbostat cat /dev/zero > /dev/null
 ^C
 cor CPU    %c0  GHz  TSC    %c1    %c3    %c6   %pc3   %pc6
-          8.63 3.64 3.38  14.46   0.49  76.42   0.00   0.00
-  0   0   0.34 3.36 3.38  99.66   0.00   0.00   0.00   0.00
-  0   6  99.96 3.64 3.38   0.04   0.00   0.00   0.00   0.00
-  1   2   0.14 3.50 3.38   1.75   2.04  96.07   0.00   0.00
-  1   8   0.38 3.57 3.38   1.51   2.04  96.07   0.00   0.00
-  2   4   0.01 2.65 3.38   0.06   0.00  99.93   0.00   0.00
-  2  10   0.03 2.12 3.38   0.04   0.00  99.93   0.00   0.00
-  8   1   0.91 3.59 3.38  35.27   0.92  62.90   0.00   0.00
-  8   7   1.61 3.63 3.38  34.57   0.92  62.90   0.00   0.00
-  9   3   0.04 3.38 3.38   0.20   0.00  99.76   0.00   0.00
-  9   9   0.04 3.29 3.38   0.20   0.00  99.76   0.00   0.00
- 10   5   0.03 3.08 3.38   0.12   0.00  99.85   0.00   0.00
- 10  11   0.05 3.07 3.38   0.10   0.00  99.85   0.00   0.00
-4.907015 sec
-
+          8.86 3.61 3.38  15.06  31.19  44.89   0.00   0.00
+  0   0   1.46 3.22 3.38  16.84  29.48  52.22   0.00   0.00
+  0   6   0.21 3.06 3.38  18.09
+  1   2   0.53 3.33 3.38   2.80  46.40  50.27
+  1   8   0.89 3.47 3.38   2.44
+  2   4   1.36 3.43 3.38   9.04  23.71  65.89
+  2  10   0.18 2.86 3.38  10.22
+  8   1   0.04 2.87 3.38  99.96   0.01   0.00
+  8   7  99.72 3.63 3.38   0.27
+  9   3   0.31 3.21 3.38   7.64  56.55  35.50
+  9   9   0.08 2.95 3.38   7.88
+ 10   5   1.42 3.43 3.38   2.14  30.99  65.44
+ 10  11   0.16 2.88 3.38   3.40
 .fi
-Above the cycle soaker drives cpu6 up 3.6 Ghz turbo limit
+Above the cycle soaker drives cpu7 up its 3.6 Ghz turbo limit
 while the other processors are generally in various states of idle.
 
-Note that cpu0 is an HT sibling sharing core0
-with cpu6, and thus it is unable to get to an idle state
-deeper than c1 while cpu6 is busy.
+Note that cpu1 and cpu7 are HT siblings within core8.
+As cpu7 is very busy, it prevents its sibling, cpu1,
+from entering a c-state deeper than c1.
 
-Note that turbostat reports average GHz of 3.64, while
+Note that turbostat reports average GHz of 3.63, while
 the arithmetic average of the GHz column above is lower.
 This is a weighted average, where the weight is %c0.  ie. it is the total number of
 un-halted cycles elapsed per time divided by the number of CPUs.
diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c
index 16de7ad4850f..b815a12159b2 100644
--- a/tools/power/x86/turbostat/turbostat.c
+++ b/tools/power/x86/turbostat/turbostat.c
@@ -67,92 +67,119 @@ double bclk;
 unsigned int show_pkg;
 unsigned int show_core;
 unsigned int show_cpu;
+unsigned int show_pkg_only;
+unsigned int show_core_only;
+char *output_buffer, *outp;
 
 int aperf_mperf_unstable;
 int backwards_count;
 char *progname;
 
-int num_cpus;
-cpu_set_t *cpu_present_set, *cpu_mask;
-size_t cpu_present_setsize, cpu_mask_size;
-
-struct counters {
-	unsigned long long tsc;		/* per thread */
-	unsigned long long aperf;	/* per thread */
-	unsigned long long mperf;	/* per thread */
-	unsigned long long c1;	/* per thread (calculated) */
-	unsigned long long c3;	/* per core */
-	unsigned long long c6;	/* per core */
-	unsigned long long c7;	/* per core */
-	unsigned long long pc2;	/* per package */
-	unsigned long long pc3;	/* per package */
-	unsigned long long pc6;	/* per package */
-	unsigned long long pc7;	/* per package */
-	unsigned long long extra_msr;	/* per thread */
-	int pkg;
-	int core;
-	int cpu;
-	struct counters *next;
-};
-
-struct counters *cnt_even;
-struct counters *cnt_odd;
-struct counters *cnt_delta;
-struct counters *cnt_average;
-struct timeval tv_even;
-struct timeval tv_odd;
-struct timeval tv_delta;
-
-int mark_cpu_present(int pkg, int core, int cpu)
+cpu_set_t *cpu_present_set, *cpu_affinity_set;
+size_t cpu_present_setsize, cpu_affinity_setsize;
+
+struct thread_data {
+	unsigned long long tsc;
+	unsigned long long aperf;
+	unsigned long long mperf;
+	unsigned long long c1;	/* derived */
+	unsigned long long extra_msr;
+	unsigned int cpu_id;
+	unsigned int flags;
+#define CPU_IS_FIRST_THREAD_IN_CORE	0x2
+#define CPU_IS_FIRST_CORE_IN_PACKAGE	0x4
+} *thread_even, *thread_odd;
+
+struct core_data {
+	unsigned long long c3;
+	unsigned long long c6;
+	unsigned long long c7;
+	unsigned int core_id;
+} *core_even, *core_odd;
+
+struct pkg_data {
+	unsigned long long pc2;
+	unsigned long long pc3;
+	unsigned long long pc6;
+	unsigned long long pc7;
+	unsigned int package_id;
+} *package_even, *package_odd;
+
+#define ODD_COUNTERS thread_odd, core_odd, package_odd
+#define EVEN_COUNTERS thread_even, core_even, package_even
+
+#define GET_THREAD(thread_base, thread_no, core_no, pkg_no) \
+	(thread_base + (pkg_no) * topo.num_cores_per_pkg * \
+		topo.num_threads_per_core + \
+		(core_no) * topo.num_threads_per_core + (thread_no))
+#define GET_CORE(core_base, core_no, pkg_no) \
+	(core_base + (pkg_no) * topo.num_cores_per_pkg + (core_no))
+#define GET_PKG(pkg_base, pkg_no) (pkg_base + pkg_no)
+
+struct system_summary {
+	struct thread_data threads;
+	struct core_data cores;
+	struct pkg_data packages;
+} sum, average;
+
+
+struct topo_params {
+	int num_packages;
+	int num_cpus;
+	int num_cores;
+	int max_cpu_num;
+	int num_cores_per_pkg;
+	int num_threads_per_core;
+} topo;
+
+struct timeval tv_even, tv_odd, tv_delta;
+
+void setup_all_buffers(void);
+
+int cpu_is_not_present(int cpu)
 {
-	CPU_SET_S(cpu, cpu_present_setsize, cpu_present_set);
-	return 0;
+	return !CPU_ISSET_S(cpu, cpu_present_setsize, cpu_present_set);
 }
-
 /*
- * cpu_mask_init(ncpus)
- *
- * allocate and clear cpu_mask
- * set cpu_mask_size
+ * run func(thread, core, package) in topology order
+ * skip non-present cpus
  */
-void cpu_mask_init(int ncpus)
+
+int for_all_cpus(int (func)(struct thread_data *, struct core_data *, struct pkg_data *),
+	struct thread_data *thread_base, struct core_data *core_base, struct pkg_data *pkg_base)
 {
-	cpu_mask = CPU_ALLOC(ncpus);
-	if (cpu_mask == NULL) {
-		perror("CPU_ALLOC");
-		exit(3);
-	}
-	cpu_mask_size = CPU_ALLOC_SIZE(ncpus);
-	CPU_ZERO_S(cpu_mask_size, cpu_mask);
+	int retval, pkg_no, core_no, thread_no;
 
-	/*
-	 * Allocate and initialize cpu_present_set
-	 */
-	cpu_present_set = CPU_ALLOC(ncpus);
-	if (cpu_present_set == NULL) {
-		perror("CPU_ALLOC");
-		exit(3);
-	}
-	cpu_present_setsize = CPU_ALLOC_SIZE(ncpus);
-	CPU_ZERO_S(cpu_present_setsize, cpu_present_set);
-	for_all_cpus(mark_cpu_present);
-}
+	for (pkg_no = 0; pkg_no < topo.num_packages; ++pkg_no) {
+		for (core_no = 0; core_no < topo.num_cores_per_pkg; ++core_no) {
+			for (thread_no = 0; thread_no <
+				topo.num_threads_per_core; ++thread_no) {
+				struct thread_data *t;
+				struct core_data *c;
+				struct pkg_data *p;
 
-void cpu_mask_uninit()
-{
-	CPU_FREE(cpu_mask);
-	cpu_mask = NULL;
-	cpu_mask_size = 0;
-	CPU_FREE(cpu_present_set);
-	cpu_present_set = NULL;
-	cpu_present_setsize = 0;
+				t = GET_THREAD(thread_base, thread_no, core_no, pkg_no);
+
+				if (cpu_is_not_present(t->cpu_id))
+					continue;
+
+				c = GET_CORE(core_base, core_no, pkg_no);
+				p = GET_PKG(pkg_base, pkg_no);
+
+				retval = func(t, c, p);
+				if (retval)
+					return retval;
+			}
+		}
+	}
+	return 0;
 }
 
 int cpu_migrate(int cpu)
 {
-	CPU_ZERO_S(cpu_mask_size, cpu_mask);
-	CPU_SET_S(cpu, cpu_mask_size, cpu_mask);
-	if (sched_setaffinity(0, cpu_mask_size, cpu_mask) == -1)
+	CPU_ZERO_S(cpu_affinity_setsize, cpu_affinity_set);
+	CPU_SET_S(cpu, cpu_affinity_setsize, cpu_affinity_set);
+	if (sched_setaffinity(0, cpu_affinity_setsize, cpu_affinity_set) == -1)
 		return -1;
 	else
 		return 0;
@@ -181,67 +208,72 @@ int get_msr(int cpu, off_t offset, unsigned long long *msr)
 void print_header(void)
 {
 	if (show_pkg)
-		fprintf(stderr, "pk");
+		outp += sprintf(outp, "pk");
 	if (show_pkg)
-		fprintf(stderr, " ");
+		outp += sprintf(outp, " ");
 	if (show_core)
-		fprintf(stderr, "cor");
+		outp += sprintf(outp, "cor");
 	if (show_cpu)
-		fprintf(stderr, " CPU");
+		outp += sprintf(outp, " CPU");
 	if (show_pkg || show_core || show_cpu)
-		fprintf(stderr, " ");
+		outp += sprintf(outp, " ");
 	if (do_nhm_cstates)
-		fprintf(stderr, "   %%c0");
+		outp += sprintf(outp, "   %%c0");
 	if (has_aperf)
-		fprintf(stderr, "  GHz");
-	fprintf(stderr, "  TSC");
+		outp += sprintf(outp, "  GHz");
+	outp += sprintf(outp, "  TSC");
 	if (do_nhm_cstates)
-		fprintf(stderr, "    %%c1");
+		outp += sprintf(outp, "    %%c1");
 	if (do_nhm_cstates)
-		fprintf(stderr, "    %%c3");
+		outp += sprintf(outp, "    %%c3");
 	if (do_nhm_cstates)
-		fprintf(stderr, "    %%c6");
+		outp += sprintf(outp, "    %%c6");
 	if (do_snb_cstates)
-		fprintf(stderr, "    %%c7");
+		outp += sprintf(outp, "    %%c7");
 	if (do_snb_cstates)
-		fprintf(stderr, "   %%pc2");
+		outp += sprintf(outp, "   %%pc2");
 	if (do_nhm_cstates)
-		fprintf(stderr, "   %%pc3");
+		outp += sprintf(outp, "   %%pc3");
 	if (do_nhm_cstates)
-		fprintf(stderr, "   %%pc6");
+		outp += sprintf(outp, "   %%pc6");
 	if (do_snb_cstates)
-		fprintf(stderr, "   %%pc7");
+		outp += sprintf(outp, "   %%pc7");
 	if (extra_msr_offset)
-		fprintf(stderr, "        MSR 0x%x ", extra_msr_offset);
+		outp += sprintf(outp, "        MSR 0x%x ", extra_msr_offset);
 
-	putc('\n', stderr);
+	outp += sprintf(outp, "\n");
 }
 
-void dump_cnt(struct counters *cnt)
+int dump_counters(struct thread_data *t, struct core_data *c,
+	struct pkg_data *p)
 {
-	if (!cnt)
-		return;
-	if (cnt->pkg) fprintf(stderr, "package: %d ", cnt->pkg);
-	if (cnt->core) fprintf(stderr, "core:: %d ", cnt->core);
-	if (cnt->cpu) fprintf(stderr, "CPU: %d ", cnt->cpu);
-	if (cnt->tsc) fprintf(stderr, "TSC: %016llX\n", cnt->tsc);
-	if (cnt->c3) fprintf(stderr, "c3: %016llX\n", cnt->c3);
-	if (cnt->c6) fprintf(stderr, "c6: %016llX\n", cnt->c6);
-	if (cnt->c7) fprintf(stderr, "c7: %016llX\n", cnt->c7);
-	if (cnt->aperf) fprintf(stderr, "aperf: %016llX\n", cnt->aperf);
-	if (cnt->pc2) fprintf(stderr, "pc2: %016llX\n", cnt->pc2);
-	if (cnt->pc3) fprintf(stderr, "pc3: %016llX\n", cnt->pc3);
-	if (cnt->pc6) fprintf(stderr, "pc6: %016llX\n", cnt->pc6);
-	if (cnt->pc7) fprintf(stderr, "pc7: %016llX\n", cnt->pc7);
-	if (cnt->extra_msr) fprintf(stderr, "msr0x%x: %016llX\n", extra_msr_offset, cnt->extra_msr);
-}
+	fprintf(stderr, "t %p, c %p, p %p\n", t, c, p);
+
+	if (t) {
+		fprintf(stderr, "CPU: %d flags 0x%x\n", t->cpu_id, t->flags);
+		fprintf(stderr, "TSC: %016llX\n", t->tsc);
+		fprintf(stderr, "aperf: %016llX\n", t->aperf);
+		fprintf(stderr, "mperf: %016llX\n", t->mperf);
+		fprintf(stderr, "c1: %016llX\n", t->c1);
+		fprintf(stderr, "msr0x%x: %016llX\n",
+			extra_msr_offset, t->extra_msr);
+	}
 
-void dump_list(struct counters *cnt)
-{
-	printf("dump_list 0x%p\n", cnt);
+	if (c) {
+		fprintf(stderr, "core: %d\n", c->core_id);
+		fprintf(stderr, "c3: %016llX\n", c->c3);
+		fprintf(stderr, "c6: %016llX\n", c->c6);
+		fprintf(stderr, "c7: %016llX\n", c->c7);
+	}
 
-	for (; cnt; cnt = cnt->next)
-		dump_cnt(cnt);
+	if (p) {
+		fprintf(stderr, "package: %d\n", p->package_id);
+		fprintf(stderr, "pc2: %016llX\n", p->pc2);
+		fprintf(stderr, "pc3: %016llX\n", p->pc3);
+		fprintf(stderr, "pc6: %016llX\n", p->pc6);
+		fprintf(stderr, "pc7: %016llX\n", p->pc7);
+	}
+	return 0;
 }
 
 /*
@@ -253,321 +285,385 @@ void dump_list(struct counters *cnt)
  * TSC: "TSC" 3 columns %3.2
  * percentage " %pc3" %6.2
  */
-void print_cnt(struct counters *p)
+int format_counters(struct thread_data *t, struct core_data *c,
+	struct pkg_data *p)
 {
 	double interval_float;
 
+	 /* if showing only 1st thread in core and this isn't one, bail out */
+	if (show_core_only && !(t->flags & CPU_IS_FIRST_THREAD_IN_CORE))
+		return 0;
+
+	 /* if showing only 1st thread in pkg and this isn't one, bail out */
+	if (show_pkg_only && !(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE))
+		return 0;
+
 	interval_float = tv_delta.tv_sec + tv_delta.tv_usec/1000000.0;
 
-	/* topology columns, print blanks on 1st (average) line */
-	if (p == cnt_average) {
+	/* topo columns, print blanks on 1st (average) line */
+	if (t == &average.threads) {
 		if (show_pkg)
-			fprintf(stderr, "  ");
+			outp += sprintf(outp, "  ");
 		if (show_pkg && show_core)
-			fprintf(stderr, " ");
+			outp += sprintf(outp, " ");
 		if (show_core)
-			fprintf(stderr, "   ");
+			outp += sprintf(outp, "   ");
 		if (show_cpu)
-			fprintf(stderr, " " "   ");
+			outp += sprintf(outp, " " "   ");
 	} else {
-		if (show_pkg)
-			fprintf(stderr, "%2d", p->pkg);
+		if (show_pkg) {
+			if (p)
+				outp += sprintf(outp, "%2d", p->package_id);
+			else
+				outp += sprintf(outp, "  ");
+		}
 		if (show_pkg && show_core)
-			fprintf(stderr, " ");
-		if (show_core)
-			fprintf(stderr, "%3d", p->core);
+			outp += sprintf(outp, " ");
+		if (show_core) {
+			if (c)
+				outp += sprintf(outp, "%3d", c->core_id);
+			else
+				outp += sprintf(outp, "   ");
+		}
 		if (show_cpu)
-			fprintf(stderr, " %3d", p->cpu);
+			outp += sprintf(outp, " %3d", t->cpu_id);
 	}
 
 	/* %c0 */
 	if (do_nhm_cstates) {
 		if (show_pkg || show_core || show_cpu)
-			fprintf(stderr, " ");
+			outp += sprintf(outp, " ");
 		if (!skip_c0)
-			fprintf(stderr, "%6.2f", 100.0 * p->mperf/p->tsc);
+			outp += sprintf(outp, "%6.2f", 100.0 * t->mperf/t->tsc);
 		else
-			fprintf(stderr, "  ****");
+			outp += sprintf(outp, "  ****");
 	}
 
 	/* GHz */
 	if (has_aperf) {
 		if (!aperf_mperf_unstable) {
-			fprintf(stderr, " %3.2f",
-				1.0 * p->tsc / units * p->aperf /
-				p->mperf / interval_float);
+			outp += sprintf(outp, " %3.2f",
+				1.0 * t->tsc / units * t->aperf /
+				t->mperf / interval_float);
 		} else {
-			if (p->aperf > p->tsc || p->mperf > p->tsc) {
-				fprintf(stderr, " ***");
+			if (t->aperf > t->tsc || t->mperf > t->tsc) {
+				outp += sprintf(outp, " ***");
 			} else {
-				fprintf(stderr, "%3.1f*",
-					1.0 * p->tsc /
-					units * p->aperf /
-					p->mperf / interval_float);
+				outp += sprintf(outp, "%3.1f*",
+					1.0 * t->tsc /
+					units * t->aperf /
+					t->mperf / interval_float);
 			}
 		}
 	}
 
 	/* TSC */
-	fprintf(stderr, "%5.2f", 1.0 * p->tsc/units/interval_float);
+	outp += sprintf(outp, "%5.2f", 1.0 * t->tsc/units/interval_float);
 
 	if (do_nhm_cstates) {
 		if (!skip_c1)
-			fprintf(stderr, " %6.2f", 100.0 * p->c1/p->tsc);
+			outp += sprintf(outp, " %6.2f", 100.0 * t->c1/t->tsc);
 		else
-			fprintf(stderr, "  ****");
+			outp += sprintf(outp, "  ****");
 	}
+
+	/* print per-core data only for 1st thread in core */
+	if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE))
+		goto done;
+
 	if (do_nhm_cstates)
-		fprintf(stderr, " %6.2f", 100.0 * p->c3/p->tsc);
+		outp += sprintf(outp, " %6.2f", 100.0 * c->c3/t->tsc);
 	if (do_nhm_cstates)
-		fprintf(stderr, " %6.2f", 100.0 * p->c6/p->tsc);
+		outp += sprintf(outp, " %6.2f", 100.0 * c->c6/t->tsc);
 	if (do_snb_cstates)
-		fprintf(stderr, " %6.2f", 100.0 * p->c7/p->tsc);
+		outp += sprintf(outp, " %6.2f", 100.0 * c->c7/t->tsc);
+
+	/* print per-package data only for 1st core in package */
+	if (!(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE))
+		goto done;
+
 	if (do_snb_cstates)
-		fprintf(stderr, " %6.2f", 100.0 * p->pc2/p->tsc);
+		outp += sprintf(outp, " %6.2f", 100.0 * p->pc2/t->tsc);
 	if (do_nhm_cstates)
-		fprintf(stderr, " %6.2f", 100.0 * p->pc3/p->tsc);
+		outp += sprintf(outp, " %6.2f", 100.0 * p->pc3/t->tsc);
 	if (do_nhm_cstates)
-		fprintf(stderr, " %6.2f", 100.0 * p->pc6/p->tsc);
+		outp += sprintf(outp, " %6.2f", 100.0 * p->pc6/t->tsc);
 	if (do_snb_cstates)
-		fprintf(stderr, " %6.2f", 100.0 * p->pc7/p->tsc);
+		outp += sprintf(outp, " %6.2f", 100.0 * p->pc7/t->tsc);
+done:
 	if (extra_msr_offset)
-		fprintf(stderr, "  0x%016llx", p->extra_msr);
-	putc('\n', stderr);
+		outp += sprintf(outp, "  0x%016llx", t->extra_msr);
+	outp += sprintf(outp, "\n");
+
+	return 0;
 }
 
-void print_counters(struct counters *counters)
+void flush_stdout()
+{
+	fputs(output_buffer, stdout);
+	outp = output_buffer;
+}
+void flush_stderr()
+{
+	fputs(output_buffer, stderr);
+	outp = output_buffer;
+}
+void format_all_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
 {
-	struct counters *cnt;
 	static int printed;
 
-
 	if (!printed || !summary_only)
 		print_header();
 
-	if (num_cpus > 1)
-		print_cnt(cnt_average);
+	if (topo.num_cpus > 1)
+		format_counters(&average.threads, &average.cores,
+			&average.packages);
 
 	printed = 1;
 
 	if (summary_only)
 		return;
 
-	for (cnt = counters; cnt != NULL; cnt = cnt->next)
-		print_cnt(cnt);
-
+	for_all_cpus(format_counters, t, c, p);
 }
 
-#define SUBTRACT_COUNTER(after, before, delta) (delta = (after - before), (before > after))
+void
+delta_package(struct pkg_data *new, struct pkg_data *old)
+{
+	old->pc2 = new->pc2 - old->pc2;
+	old->pc3 = new->pc3 - old->pc3;
+	old->pc6 = new->pc6 - old->pc6;
+	old->pc7 = new->pc7 - old->pc7;
+}
 
-int compute_delta(struct counters *after,
-	struct counters *before, struct counters *delta)
+void
+delta_core(struct core_data *new, struct core_data *old)
 {
-	int errors = 0;
-	int perf_err = 0;
+	old->c3 = new->c3 - old->c3;
+	old->c6 = new->c6 - old->c6;
+	old->c7 = new->c7 - old->c7;
+}
 
-	skip_c0 = skip_c1 = 0;
+void
+delta_thread(struct thread_data *new, struct thread_data *old,
+	struct core_data *core_delta)
+{
+	old->tsc = new->tsc - old->tsc;
+
+	/* check for TSC < 1 Mcycles over interval */
+	if (old->tsc < (1000 * 1000)) {
+		fprintf(stderr, "Insanely slow TSC rate, TSC stops in idle?\n");
+		fprintf(stderr, "You can disable all c-states by booting with \"idle=poll\"\n");
+		fprintf(stderr, "or just the deep ones with \"processor.max_cstate=1\"\n");
+		exit(-3);
+	}
 
-	for ( ; after && before && delta;
-		after = after->next, before = before->next, delta = delta->next) {
-		if (before->cpu != after->cpu) {
-			printf("cpu configuration changed: %d != %d\n",
-				before->cpu, after->cpu);
-			return -1;
-		}
+	old->c1 = new->c1 - old->c1;
 
-		if (SUBTRACT_COUNTER(after->tsc, before->tsc, delta->tsc)) {
-			fprintf(stderr, "cpu%d TSC went backwards %llX to %llX\n",
-				before->cpu, before->tsc, after->tsc);
-			errors++;
-		}
-		/* check for TSC < 1 Mcycles over interval */
-		if (delta->tsc < (1000 * 1000)) {
-			fprintf(stderr, "Insanely slow TSC rate,"
-				" TSC stops in idle?\n");
-			fprintf(stderr, "You can disable all c-states"
-				" by booting with \"idle=poll\"\n");
-			fprintf(stderr, "or just the deep ones with"
-				" \"processor.max_cstate=1\"\n");
-			exit(-3);
-		}
-		if (SUBTRACT_COUNTER(after->c3, before->c3, delta->c3)) {
-			fprintf(stderr, "cpu%d c3 counter went backwards %llX to %llX\n",
-				before->cpu, before->c3, after->c3);
-			errors++;
-		}
-		if (SUBTRACT_COUNTER(after->c6, before->c6, delta->c6)) {
-			fprintf(stderr, "cpu%d c6 counter went backwards %llX to %llX\n",
-				before->cpu, before->c6, after->c6);
-			errors++;
-		}
-		if (SUBTRACT_COUNTER(after->c7, before->c7, delta->c7)) {
-			fprintf(stderr, "cpu%d c7 counter went backwards %llX to %llX\n",
-				before->cpu, before->c7, after->c7);
-			errors++;
-		}
-		if (SUBTRACT_COUNTER(after->pc2, before->pc2, delta->pc2)) {
-			fprintf(stderr, "cpu%d pc2 counter went backwards %llX to %llX\n",
-				before->cpu, before->pc2, after->pc2);
-			errors++;
-		}
-		if (SUBTRACT_COUNTER(after->pc3, before->pc3, delta->pc3)) {
-			fprintf(stderr, "cpu%d pc3 counter went backwards %llX to %llX\n",
-				before->cpu, before->pc3, after->pc3);
-			errors++;
-		}
-		if (SUBTRACT_COUNTER(after->pc6, before->pc6, delta->pc6)) {
-			fprintf(stderr, "cpu%d pc6 counter went backwards %llX to %llX\n",
-				before->cpu, before->pc6, after->pc6);
-			errors++;
-		}
-		if (SUBTRACT_COUNTER(after->pc7, before->pc7, delta->pc7)) {
-			fprintf(stderr, "cpu%d pc7 counter went backwards %llX to %llX\n",
-				before->cpu, before->pc7, after->pc7);
-			errors++;
-		}
+	if ((new->aperf > old->aperf) && (new->mperf > old->mperf)) {
+		old->aperf = new->aperf - old->aperf;
+		old->mperf = new->mperf - old->mperf;
+	} else {
 
-		perf_err = SUBTRACT_COUNTER(after->aperf, before->aperf, delta->aperf);
-		if (perf_err) {
-			fprintf(stderr, "cpu%d aperf counter went backwards %llX to %llX\n",
-				before->cpu, before->aperf, after->aperf);
-		}
-		perf_err |= SUBTRACT_COUNTER(after->mperf, before->mperf, delta->mperf);
-		if (perf_err) {
-			fprintf(stderr, "cpu%d mperf counter went backwards %llX to %llX\n",
-				before->cpu, before->mperf, after->mperf);
-		}
-		if (perf_err) {
-			if (!aperf_mperf_unstable) {
-				fprintf(stderr, "%s: APERF or MPERF went backwards *\n", progname);
-				fprintf(stderr, "* Frequency results do not cover entire interval *\n");
-				fprintf(stderr, "* fix this by running Linux-2.6.30 or later *\n");
+		if (!aperf_mperf_unstable) {
+			fprintf(stderr, "%s: APERF or MPERF went backwards *\n", progname);
+			fprintf(stderr, "* Frequency results do not cover entire interval *\n");
+			fprintf(stderr, "* fix this by running Linux-2.6.30 or later *\n");
 
-				aperf_mperf_unstable = 1;
-			}
-			/*
-			 * mperf delta is likely a huge "positive" number
-			 * can not use it for calculating c0 time
-			 */
-			skip_c0 = 1;
-			skip_c1 = 1;
+			aperf_mperf_unstable = 1;
 		}
-
 		/*
-		 * As mperf and tsc collection are not atomic,
-		 * it is possible for mperf's non-halted cycles
-		 * to exceed TSC's all cycles: show c1 = 0% in that case.
+		 * mperf delta is likely a huge "positive" number
+		 * can not use it for calculating c0 time
 		 */
-		if (delta->mperf > delta->tsc)
-			delta->c1 = 0;
-		else /* normal case, derive c1 */
-			delta->c1 = delta->tsc - delta->mperf
-				- delta->c3 - delta->c6 - delta->c7;
+		skip_c0 = 1;
+		skip_c1 = 1;
+	}
 
-		if (delta->mperf == 0)
-			delta->mperf = 1;	/* divide by 0 protection */
 
-		/*
-		 * for "extra msr", just copy the latest w/o subtracting
-		 */
-		delta->extra_msr = after->extra_msr;
-		if (errors) {
-			fprintf(stderr, "ERROR cpu%d before:\n", before->cpu);
-			dump_cnt(before);
-			fprintf(stderr, "ERROR cpu%d after:\n", before->cpu);
-			dump_cnt(after);
-			errors = 0;
-		}
+	/*
+	 * As mperf and tsc collection are not atomic,
+	 * it is possible for mperf's non-halted cycles
+	 * to exceed TSC's all cycles: show c1 = 0% in that case.
+	 */
+	if (old->mperf > old->tsc)
+		old->c1 = 0;
+	else {
+		/* normal case, derive c1 */
+		old->c1 = old->tsc - old->mperf - core_delta->c3
+				- core_delta->c6 - core_delta->c7;
+	}
+	if (old->mperf == 0) {
+		if (verbose) fprintf(stderr, "cpu%d MPERF 0!\n", old->cpu_id);
+		old->mperf = 1;	/* divide by 0 protection */
 	}
+
+	/*
+	 * for "extra msr", just copy the latest w/o subtracting
+	 */
+	old->extra_msr = new->extra_msr;
+}
+
+int delta_cpu(struct thread_data *t, struct core_data *c,
+	struct pkg_data *p, struct thread_data *t2,
+	struct core_data *c2, struct pkg_data *p2)
+{
+	/* calculate core delta only for 1st thread in core */
+	if (t->flags & CPU_IS_FIRST_THREAD_IN_CORE)
+		delta_core(c, c2);
+
+	/* always calculate thread delta */
+	delta_thread(t, t2, c2);	/* c2 is core delta */
+
+	/* calculate package delta only for 1st core in package */
+	if (t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE)
+		delta_package(p, p2);
+
 	return 0;
 }
 
-void compute_average(struct counters *delta, struct counters *avg)
+void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
+{
+	t->tsc = 0;
+	t->aperf = 0;
+	t->mperf = 0;
+	t->c1 = 0;
+
+	/* tells format_counters to dump all fields from this set */
+	t->flags = CPU_IS_FIRST_THREAD_IN_CORE | CPU_IS_FIRST_CORE_IN_PACKAGE;
+
+	c->c3 = 0;
+	c->c6 = 0;
+	c->c7 = 0;
+
+	p->pc2 = 0;
+	p->pc3 = 0;
+	p->pc6 = 0;
+	p->pc7 = 0;
+}
+int sum_counters(struct thread_data *t, struct core_data *c,
+	struct pkg_data *p)
 {
-	struct counters *sum;
+	average.threads.tsc += t->tsc;
+	average.threads.aperf += t->aperf;
+	average.threads.mperf += t->mperf;
+	average.threads.c1 += t->c1;
 
-	sum = calloc(1, sizeof(struct counters));
-	if (sum == NULL) {
-		perror("calloc sum");
-		exit(1);
-	}
+	/* sum per-core values only for 1st thread in core */
+	if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE))
+		return 0;
 
-	for (; delta; delta = delta->next) {
-		sum->tsc += delta->tsc;
-		sum->c1 += delta->c1;
-		sum->c3 += delta->c3;
-		sum->c6 += delta->c6;
-		sum->c7 += delta->c7;
-		sum->aperf += delta->aperf;
-		sum->mperf += delta->mperf;
-		sum->pc2 += delta->pc2;
-		sum->pc3 += delta->pc3;
-		sum->pc6 += delta->pc6;
-		sum->pc7 += delta->pc7;
-	}
-	avg->tsc = sum->tsc/num_cpus;
-	avg->c1 = sum->c1/num_cpus;
-	avg->c3 = sum->c3/num_cpus;
-	avg->c6 = sum->c6/num_cpus;
-	avg->c7 = sum->c7/num_cpus;
-	avg->aperf = sum->aperf/num_cpus;
-	avg->mperf = sum->mperf/num_cpus;
-	avg->pc2 = sum->pc2/num_cpus;
-	avg->pc3 = sum->pc3/num_cpus;
-	avg->pc6 = sum->pc6/num_cpus;
-	avg->pc7 = sum->pc7/num_cpus;
-
-	free(sum);
+	average.cores.c3 += c->c3;
+	average.cores.c6 += c->c6;
+	average.cores.c7 += c->c7;
+
+	/* sum per-pkg values only for 1st core in pkg */
+	if (!(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE))
+		return 0;
+
+	average.packages.pc2 += p->pc2;
+	average.packages.pc3 += p->pc3;
+	average.packages.pc6 += p->pc6;
+	average.packages.pc7 += p->pc7;
+
+	return 0;
+}
+/*
+ * sum the counters for all cpus in the system
+ * compute the weighted average
+ */
+void compute_average(struct thread_data *t, struct core_data *c,
+	struct pkg_data *p)
+{
+	clear_counters(&average.threads, &average.cores, &average.packages);
+
+	for_all_cpus(sum_counters, t, c, p);
+
+	average.threads.tsc /= topo.num_cpus;
+	average.threads.aperf /= topo.num_cpus;
+	average.threads.mperf /= topo.num_cpus;
+	average.threads.c1 /= topo.num_cpus;
+
+	average.cores.c3 /= topo.num_cores;
+	average.cores.c6 /= topo.num_cores;
+	average.cores.c7 /= topo.num_cores;
+
+	average.packages.pc2 /= topo.num_packages;
+	average.packages.pc3 /= topo.num_packages;
+	average.packages.pc6 /= topo.num_packages;
+	average.packages.pc7 /= topo.num_packages;
 }
 
-int get_counters(struct counters *cnt)
+static unsigned long long rdtsc(void)
 {
-	for ( ; cnt; cnt = cnt->next) {
+	unsigned int low, high;
 
-		if (cpu_migrate(cnt->cpu))
-			return -1;
+	asm volatile("rdtsc" : "=a" (low), "=d" (high));
 
-		if (get_msr(cnt->cpu, MSR_TSC, &cnt->tsc))
-			return -1;
+	return low | ((unsigned long long)high) << 32;
+}
 
-		if (has_aperf) {
-			if (get_msr(cnt->cpu, MSR_APERF, &cnt->aperf))
-				return -1;
-			if (get_msr(cnt->cpu, MSR_MPERF, &cnt->mperf))
-				return -1;
-		}
 
-		if (do_nhm_cstates) {
-			if (get_msr(cnt->cpu, MSR_CORE_C3_RESIDENCY, &cnt->c3))
-				return -1;
-			if (get_msr(cnt->cpu, MSR_CORE_C6_RESIDENCY, &cnt->c6))
-				return -1;
-		}
+/*
+ * get_counters(...)
+ * migrate to cpu
+ * acquire and record local counters for that cpu
+ */
+int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
+{
+	int cpu = t->cpu_id;
 
-		if (do_snb_cstates)
-			if (get_msr(cnt->cpu, MSR_CORE_C7_RESIDENCY, &cnt->c7))
-				return -1;
+	if (cpu_migrate(cpu))
+		return -1;
 
-		if (do_nhm_cstates) {
-			if (get_msr(cnt->cpu, MSR_PKG_C3_RESIDENCY, &cnt->pc3))
-				return -1;
-			if (get_msr(cnt->cpu, MSR_PKG_C6_RESIDENCY, &cnt->pc6))
-				return -1;
-		}
-		if (do_snb_cstates) {
-			if (get_msr(cnt->cpu, MSR_PKG_C2_RESIDENCY, &cnt->pc2))
-				return -1;
-			if (get_msr(cnt->cpu, MSR_PKG_C7_RESIDENCY, &cnt->pc7))
-				return -1;
-		}
-		if (extra_msr_offset)
-			if (get_msr(cnt->cpu, extra_msr_offset, &cnt->extra_msr))
-				return -1;
+	t->tsc = rdtsc();	/* we are running on local CPU of interest */
+
+	if (has_aperf) {
+		if (get_msr(cpu, MSR_APERF, &t->aperf))
+			return -3;
+		if (get_msr(cpu, MSR_MPERF, &t->mperf))
+			return -4;
+	}
+
+	if (extra_msr_offset)
+		if (get_msr(cpu, extra_msr_offset, &t->extra_msr))
+			return -5;
+
+	/* collect core counters only for 1st thread in core */
+	if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE))
+		return 0;
+
+	if (do_nhm_cstates) {
+		if (get_msr(cpu, MSR_CORE_C3_RESIDENCY, &c->c3))
+			return -6;
+		if (get_msr(cpu, MSR_CORE_C6_RESIDENCY, &c->c6))
+			return -7;
+	}
+
+	if (do_snb_cstates)
+		if (get_msr(cpu, MSR_CORE_C7_RESIDENCY, &c->c7))
+			return -8;
+
+	/* collect package counters only for 1st core in package */
+	if (!(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE))
+		return 0;
+
+	if (do_nhm_cstates) {
+		if (get_msr(cpu, MSR_PKG_C3_RESIDENCY, &p->pc3))
+			return -9;
+		if (get_msr(cpu, MSR_PKG_C6_RESIDENCY, &p->pc6))
+			return -10;
+	}
+	if (do_snb_cstates) {
+		if (get_msr(cpu, MSR_PKG_C2_RESIDENCY, &p->pc2))
+			return -11;
+		if (get_msr(cpu, MSR_PKG_C7_RESIDENCY, &p->pc7))
+			return -12;
 	}
 	return 0;
 }
 
-void print_nehalem_info(void)
+void print_verbose_header(void)
 {
 	unsigned long long msr;
 	unsigned int ratio;
@@ -615,143 +711,82 @@ void print_nehalem_info(void)
 
 }
 
-void free_counter_list(struct counters *list)
+void free_all_buffers(void)
 {
-	struct counters *p;
+	CPU_FREE(cpu_present_set);
+	cpu_present_set = NULL;
+	cpu_present_set = 0;
 
-	for (p = list; p; ) {
-		struct counters *free_me;
+	CPU_FREE(cpu_affinity_set);
+	cpu_affinity_set = NULL;
+	cpu_affinity_setsize = 0;
 
-		free_me = p;
-		p = p->next;
-		free(free_me);
-	}
-}
+	free(thread_even);
+	free(core_even);
+	free(package_even);
 
-void free_all_counters(void)
-{
-	free_counter_list(cnt_even);
-	cnt_even = NULL;
+	thread_even = NULL;
+	core_even = NULL;
+	package_even = NULL;
 
-	free_counter_list(cnt_odd);
-	cnt_odd = NULL;
+	free(thread_odd);
+	free(core_odd);
+	free(package_odd);
 
-	free_counter_list(cnt_delta);
-	cnt_delta = NULL;
+	thread_odd = NULL;
+	core_odd = NULL;
+	package_odd = NULL;
 
-	free_counter_list(cnt_average);
-	cnt_average = NULL;
+	free(output_buffer);
+	output_buffer = NULL;
+	outp = NULL;
 }
 
-void insert_counters(struct counters **list,
-	struct counters *new)
+/*
+ * cpu_is_first_sibling_in_core(cpu)
+ * return 1 if given CPU is 1st HT sibling in the core
+ */
+int cpu_is_first_sibling_in_core(int cpu)
 {
-	struct counters *prev;
-
-	/*
-	 * list was empty
-	 */
-	if (*list == NULL) {
-		new->next = *list;
-		*list = new;
-		return;
-	}
-
-	if (!summary_only)
-		show_cpu = 1;	/* there is more than one CPU */
-
-	/*
-	 * insert on front of list.
-	 * It is sorted by ascending package#, core#, cpu#
-	 */
-	if (((*list)->pkg > new->pkg) ||
-	    (((*list)->pkg == new->pkg) && ((*list)->core > new->core)) ||
-	    (((*list)->pkg == new->pkg) && ((*list)->core == new->core) && ((*list)->cpu > new->cpu))) {
-		new->next = *list;
-		*list = new;
-		return;
-	}
-
-	prev = *list;
-
-	while (prev->next && (prev->next->pkg < new->pkg)) {
-		prev = prev->next;
-		if (!summary_only)
-			show_pkg = 1;	/* there is more than 1 package */
-	}
-
-	while (prev->next && (prev->next->pkg == new->pkg)
-		&& (prev->next->core < new->core)) {
-		prev = prev->next;
-		if (!summary_only)
-			show_core = 1;	/* there is more than 1 core */
-	}
+	char path[64];
+	FILE *filep;
+	int first_cpu;
 
-	while (prev->next && (prev->next->pkg == new->pkg)
-		&& (prev->next->core == new->core)
-		&& (prev->next->cpu < new->cpu)) {
-		prev = prev->next;
+	sprintf(path, "/sys/devices/system/cpu/cpu%d/topology/thread_siblings_list", cpu);
+	filep = fopen(path, "r");
+	if (filep == NULL) {
+		perror(path);
+		exit(1);
 	}
-
-	/*
-	 * insert after "prev"
-	 */
-	new->next = prev->next;
-	prev->next = new;
+	fscanf(filep, "%d", &first_cpu);
+	fclose(filep);
+	return (cpu == first_cpu);
 }
 
-void alloc_new_counters(int pkg, int core, int cpu)
+/*
+ * cpu_is_first_core_in_package(cpu)
+ * return 1 if given CPU is 1st core in package
+ */
+int cpu_is_first_core_in_package(int cpu)
 {
-	struct counters *new;
-
-	if (verbose > 1)
-		printf("pkg%d core%d, cpu%d\n", pkg, core, cpu);
-
-	new = (struct counters *)calloc(1, sizeof(struct counters));
-	if (new == NULL) {
-		perror("calloc");
-		exit(1);
-	}
-	new->pkg = pkg;
-	new->core = core;
-	new->cpu = cpu;
-	insert_counters(&cnt_odd, new);
-
-	new = (struct counters *)calloc(1,
-		sizeof(struct counters));
-	if (new == NULL) {
-		perror("calloc");
-		exit(1);
-	}
-	new->pkg = pkg;
-	new->core = core;
-	new->cpu = cpu;
-	insert_counters(&cnt_even, new);
-
-	new = (struct counters *)calloc(1, sizeof(struct counters));
-	if (new == NULL) {
-		perror("calloc");
-		exit(1);
-	}
-	new->pkg = pkg;
-	new->core = core;
-	new->cpu = cpu;
-	insert_counters(&cnt_delta, new);
+	char path[64];
+	FILE *filep;
+	int first_cpu;
 
-	new = (struct counters *)calloc(1, sizeof(struct counters));
-	if (new == NULL) {
-		perror("calloc");
+	sprintf(path, "/sys/devices/system/cpu/cpu%d/topology/core_siblings_list", cpu);
+	filep = fopen(path, "r");
+	if (filep == NULL) {
+		perror(path);
 		exit(1);
 	}
-	new->pkg = pkg;
-	new->core = core;
-	new->cpu = cpu;
-	cnt_average = new;
+	fscanf(filep, "%d", &first_cpu);
+	fclose(filep);
+	return (cpu == first_cpu);
 }
 
 int get_physical_package_id(int cpu)
 {
-	char path[64];
+	char path[80];
 	FILE *filep;
 	int pkg;
 
@@ -768,7 +803,7 @@ int get_physical_package_id(int cpu)
 
 int get_core_id(int cpu)
 {
-	char path[64];
+	char path[80];
 	FILE *filep;
 	int core;
 
@@ -783,14 +818,87 @@ int get_core_id(int cpu)
 	return core;
 }
 
+int get_num_ht_siblings(int cpu)
+{
+	char path[80];
+	FILE *filep;
+	int sib1, sib2;
+	int matches;
+	char character;
+
+	sprintf(path, "/sys/devices/system/cpu/cpu%d/topology/thread_siblings_list", cpu);
+	filep = fopen(path, "r");
+	if (filep == NULL) {
+		perror(path);
+		exit(1);
+	}
+	/*
+	 * file format:
+	 * if a pair of number with a character between: 2 siblings (eg. 1-2, or 1,4)
+	 * otherwinse 1 sibling (self).
+	 */
+	matches = fscanf(filep, "%d%c%d\n", &sib1, &character, &sib2);
+
+	fclose(filep);
+
+	if (matches == 3)
+		return 2;
+	else
+		return 1;
+}
+
 /*
- * run func(pkg, core, cpu) on every cpu in /proc/stat
+ * run func(thread, core, package) in topology order
+ * skip non-present cpus
  */
 
-int for_all_cpus(void (func)(int, int, int))
+int for_all_cpus_2(int (func)(struct thread_data *, struct core_data *,
+	struct pkg_data *, struct thread_data *, struct core_data *,
+	struct pkg_data *), struct thread_data *thread_base,
+	struct core_data *core_base, struct pkg_data *pkg_base,
+	struct thread_data *thread_base2, struct core_data *core_base2,
+	struct pkg_data *pkg_base2)
+{
+	int retval, pkg_no, core_no, thread_no;
+
+	for (pkg_no = 0; pkg_no < topo.num_packages; ++pkg_no) {
+		for (core_no = 0; core_no < topo.num_cores_per_pkg; ++core_no) {
+			for (thread_no = 0; thread_no <
+				topo.num_threads_per_core; ++thread_no) {
+				struct thread_data *t, *t2;
+				struct core_data *c, *c2;
+				struct pkg_data *p, *p2;
+
+				t = GET_THREAD(thread_base, thread_no, core_no, pkg_no);
+
+				if (cpu_is_not_present(t->cpu_id))
+					continue;
+
+				t2 = GET_THREAD(thread_base2, thread_no, core_no, pkg_no);
+
+				c = GET_CORE(core_base, core_no, pkg_no);
+				c2 = GET_CORE(core_base2, core_no, pkg_no);
+
+				p = GET_PKG(pkg_base, pkg_no);
+				p2 = GET_PKG(pkg_base2, pkg_no);
+
+				retval = func(t, c, p, t2, c2, p2);
+				if (retval)
+					return retval;
+			}
+		}
+	}
+	return 0;
+}
+
+/*
+ * run func(cpu) on every cpu in /proc/stat
+ * return max_cpu number
+ */
+int for_all_proc_cpus(int (func)(int))
 {
 	FILE *fp;
-	int cpu_count;
+	int cpu_num;
 	int retval;
 
 	fp = fopen(proc_stat, "r");
@@ -805,78 +913,88 @@ int for_all_cpus(void (func)(int, int, int))
 		exit(1);
 	}
 
-	for (cpu_count = 0; ; cpu_count++) {
-		int cpu;
-
-		retval = fscanf(fp, "cpu%u %*d %*d %*d %*d %*d %*d %*d %*d %*d %*d\n", &cpu);
+	while (1) {
+		retval = fscanf(fp, "cpu%u %*d %*d %*d %*d %*d %*d %*d %*d %*d %*d\n", &cpu_num);
 		if (retval != 1)
 			break;
 
-		func(get_physical_package_id(cpu), get_core_id(cpu), cpu);
+		retval = func(cpu_num);
+		if (retval) {
+			fclose(fp);
+			return(retval);
+		}
 	}
 	fclose(fp);
-	return cpu_count;
+	return 0;
 }
 
 void re_initialize(void)
 {
-	free_all_counters();
-	num_cpus = for_all_cpus(alloc_new_counters);
-	cpu_mask_uninit();
-	cpu_mask_init(num_cpus);
-	printf("turbostat: re-initialized with num_cpus %d\n", num_cpus);
+	free_all_buffers();
+	setup_all_buffers();
+	printf("turbostat: re-initialized with num_cpus %d\n", topo.num_cpus);
 }
 
-void dummy(int pkg, int core, int cpu) { return; }
+
 /*
- * check to see if a cpu came on-line
+ * count_cpus()
+ * remember the last one seen, it will be the max
  */
-int verify_num_cpus(void)
+int count_cpus(int cpu)
 {
-	int new_num_cpus;
+	if (topo.max_cpu_num < cpu)
+		topo.max_cpu_num = cpu;
 
-	new_num_cpus = for_all_cpus(dummy);
-
-	if (new_num_cpus != num_cpus) {
-		if (verbose)
-			printf("num_cpus was %d, is now  %d\n",
-				num_cpus, new_num_cpus);
-		return -1;
-	}
+	topo.num_cpus += 1;
+	return 0;
+}
+int mark_cpu_present(int cpu)
+{
+	CPU_SET_S(cpu, cpu_present_setsize, cpu_present_set);
 	return 0;
 }
 
 void turbostat_loop()
 {
+	int retval;
+
 restart:
-	get_counters(cnt_even);
+	retval = for_all_cpus(get_counters, EVEN_COUNTERS);
+	if (retval) {
+		re_initialize();
+		goto restart;
+	}
 	gettimeofday(&tv_even, (struct timezone *)NULL);
 
 	while (1) {
-		if (verify_num_cpus()) {
+		if (for_all_proc_cpus(cpu_is_not_present)) {
 			re_initialize();
 			goto restart;
 		}
 		sleep(interval_sec);
-		if (get_counters(cnt_odd)) {
+		retval = for_all_cpus(get_counters, ODD_COUNTERS);
+		if (retval) {
 			re_initialize();
 			goto restart;
 		}
 		gettimeofday(&tv_odd, (struct timezone *)NULL);
-		compute_delta(cnt_odd, cnt_even, cnt_delta);
 		timersub(&tv_odd, &tv_even, &tv_delta);
-		compute_average(cnt_delta, cnt_average);
-		print_counters(cnt_delta);
+		for_all_cpus_2(delta_cpu, ODD_COUNTERS, EVEN_COUNTERS);
+		compute_average(EVEN_COUNTERS);
+		format_all_counters(EVEN_COUNTERS);
+		flush_stdout();
 		sleep(interval_sec);
-		if (get_counters(cnt_even)) {
+		retval = for_all_cpus(get_counters, EVEN_COUNTERS);
+		if (retval) {
 			re_initialize();
 			goto restart;
 		}
 		gettimeofday(&tv_even, (struct timezone *)NULL);
-		compute_delta(cnt_even, cnt_odd, cnt_delta);
 		timersub(&tv_even, &tv_odd, &tv_delta);
-		compute_average(cnt_delta, cnt_average);
-		print_counters(cnt_delta);
+		for_all_cpus_2(delta_cpu, EVEN_COUNTERS, ODD_COUNTERS);
+		compute_average(ODD_COUNTERS);
+		format_all_counters(ODD_COUNTERS);
+		flush_stdout();
 	}
 }
 
@@ -1051,6 +1169,208 @@ int open_dev_cpu_msr(int dummy1)
 	return 0;
 }
 
+void topology_probe()
+{
+	int i;
+	int max_core_id = 0;
+	int max_package_id = 0;
+	int max_siblings = 0;
+	struct cpu_topology {
+		int core_id;
+		int physical_package_id;
+	} *cpus;
+
+	/* Initialize num_cpus, max_cpu_num */
+	topo.num_cpus = 0;
+	topo.max_cpu_num = 0;
+	for_all_pr