changeset 5:17c01f69db69 draft default tip

finish
author Daichi TOMA <toma@cr.ie.u-ryukyu.ac.jp>
date Mon, 23 Jul 2012 11:58:20 +0900
parents 03e644cc3366
children
files Paper/book.bib Paper/cerium.bib Paper/paper.tex Paper/pic/PPE.pdf Paper/pic/PPE.xbb Paper/pic/SPE.pdf Paper/pic/SPE.xbb Paper/pic/cell-main.pdf Paper/pic/cell-main.xbb
diffstat 9 files changed, 165 insertions(+), 12 deletions(-) [+]
line wrap: on
line diff
--- a/Paper/book.bib	Mon Jul 23 06:51:08 2012 +0900
+++ b/Paper/book.bib	Mon Jul 23 11:58:20 2012 +0900
@@ -3,7 +3,7 @@
 	author={Clay Breshears},
 	publisher={O'REILLY},
 	year={2009},
-	month={12},
+	month={December},
 	isbn={9784873114354},
 	url={http://amazon.co.jp/o/ASIN/4873114357/},
 	price={¥ 3,360},
--- a/Paper/cerium.bib	Mon Jul 23 06:51:08 2012 +0900
+++ b/Paper/cerium.bib	Mon Jul 23 11:58:20 2012 +0900
@@ -54,6 +54,38 @@
 year = 2006
 }
 
+@misc{cell_wiki,
+title = "{Cell}",
+howpublished = "{http://en.wikipedia.org/wiki/Cell\_(microprocessor)}"
+}
+
+@manual{cell-ibm,
+author = "{IBM}",
+title = "{IBM Research - Cell}",
+year = 2005
+}
+
+@article{cell-ieee,
+    author = {Gschwind, Michael and Hofstee, H. Peter and Flachs, Brian and Hopkins, Martin and Watanabe, Yukio and Yamazaki, Takeshi},
+    title = {Synergistic Processing in Cell's Multicore Architecture},
+    journal = {IEEE Micro},
+    issue_date = {March 2006},
+    volume = {26},
+    number = {2},
+    month = mar,
+    year = {2006},
+    issn = {0272-1732},
+    pages = {10--24},
+    numpages = {15},
+    url = {http://dx.doi.org/10.1109/MM.2006.41},
+    doi = {10.1109/MM.2006.41},
+    acmid = {1130803},
+    publisher = {IEEE Computer Society Press},
+    address = {Los Alamitos, CA, USA},
+    keywords = {Cell Broadband Engine, multicore architecture, synergistic processing},
+} 
+
+
 @manual{cell_sdk,
 author = "{International Business Machines Corporation}",
 title = "{Software Development Kit for Multicore Acceleration Version 3.1}",
@@ -291,3 +323,21 @@
 	month = "Sep",
 	year = 2011
 }
+
+@inproceedings{2006:CMC,
+    author = {Gschwind, Michael},
+    title = {Chip multiprocessing and the cell broadband engine},
+    booktitle = {Proceedings of the 3rd conference on Computing frontiers},
+    series = {CF '06},
+    year = {2006},
+    isbn = {1-59593-302-6},
+    location = {Ischia, Italy},
+    pages = {1--8},
+    numpages = {8},
+    url = {http://doi.acm.org/10.1145/1128022.1128023},
+    doi = {10.1145/1128022.1128023},
+    acmid = {1128023},
+    publisher = {ACM},
+    address = {New York, NY, USA},
+    keywords = {cell broadband engine, chip multiprocessing, compute-transfer parallelism (CTP), heterogeneous chip multiprocessor, memory-level parallelism (MLP)},
+} 
--- a/Paper/paper.tex	Mon Jul 23 06:51:08 2012 +0900
+++ b/Paper/paper.tex	Mon Jul 23 11:58:20 2012 +0900
@@ -1,4 +1,4 @@
-\documentclass[twocolumn,twoside,9.5pt]{article}
+\documentclass[twocolumn,twoside,11pt]{article}
 \usepackage[dvipdfmx]{graphicx}
 \usepackage{url}
 \usepackage{picins}
@@ -8,13 +8,13 @@
 \rhead{}
 \cfoot{}
 
-\setlength{\topmargin}{-1in \addtolength{\topmargin}{15mm}}
+\setlength{\topmargin}{-1in \addtolength{\topmargin}{20mm}}
 \setlength{\headheight}{0mm}
 \setlength{\headsep}{5mm}
-\setlength{\oddsidemargin}{-1in \addtolength{\oddsidemargin}{15mm}}
-\setlength{\evensidemargin}{-1in \addtolength{\evensidemargin}{15mm}}
-\setlength{\textwidth}{181mm}
-\setlength{\textheight}{261mm}
+\setlength{\oddsidemargin}{-1in \addtolength{\oddsidemargin}{20mm}}
+\setlength{\evensidemargin}{-1in \addtolength{\evensidemargin}{20mm}}
+\setlength{\textwidth}{171mm}
+\setlength{\textheight}{256mm}
 \setlength{\footskip}{0mm}
 \pagestyle{empty}
 
@@ -78,6 +78,87 @@
   \label{fig:cell_arch}
 \end{figure}
 
+The Cell processor marries the SPEs and the PPE via EIB to give access,
+via fully cache coherent DMA (direct memory access), to both main memory and to other external data storage. 
+To make the best of EIB, and to overlap computation and data transfer,
+each of the nine processing elements (PPE and SPEs) is equipped with a DMA engine.
+Since the SPE's load/store instructions can only access its own local memory,
+each SPE entirely depends on DMAs to transfer data to and from the main memory and other SPEs' local memories.
+A DMA operation can transfer either a single block area of size up to 16KB, or a list of 2 to 2048 such blocks.
+One of the major design decisions in the architecture of Cell is the use of DMAs as a central means of intra-chip data transfer,
+with a view to enabling maximal asynchrony and concurrency in data processing inside a chip\cite{2006:CMC}.
+
+The PPE, which is capable of running a conventional operating system, 
+has control over the SPEs and can start, stop, interrupt, and schedule processes running on the SPEs.
+To this end the PPE has additional instructions relating to control of the SPEs. 
+Unlike SPEs, the PPE can read and write the main memory and the local memories of SPEs through the standard load/store instructions.
+Despite having Turing complete architectures, 
+the SPEs are not fully autonomous and require the PPE to prime them before they can do any useful work.
+Though most of the ``horsepower'' of the system comes from the synergistic processing elements,
+the use of DMA as a method of data transfer and the limited local memory footprint of each SPE pose a major challenge
+to software developers who wish to make the most of this horsepower,
+demanding careful hand-tuning of programs to extract maximal performance from this CPU.
+
+The PPE and bus architecture includes various modes of operation giving different levels of memory protection,
+allowing areas of memory to be protected from access by specific processes running on the SPEs or the PPE.
+
+Both the PPE and SPE are RISC architectures with a fixed-width 32-bit instruction format.
+The PPE contains a 64-bit general purpose register set (GPR), a 64-bit floating point register set (FPR),
+and a 128-bit Altivec register set. The SPE contains 128-bit registers only.
+These can be used for scalar data types ranging from 8 bits to 128 bits
+in size or for SIMD computations on a variety of integer and floating point formats.
+System memory addresses for both the PPE and SPE are expressed as 64-bit values
+for a theoretical address range of $2^{64}$ bytes (16 exabytes or 16,777,216 terabytes).
+In practice, not all of these bits are implemented in hardware.
+Local store addresses internal to the SPU processor are expressed as a 32-bit word.
+In documentation relating to Cell a word is always taken to mean 32 bits, a doubleword means 64 bits, and a quadword means 128 bits.
+
+
+\subsubsection{Power Processor Element (PPE)}
+The PPE (Figure~\ref{fig:ppe}) is the Power Architecture based,
+two-way multithreaded core acting as the controller for the eight SPEs,
+which handle most of the computational workload. The PPE will work 
+with conventional operating systems due to its similarity to other 64-bit PowerPC processors, 
+while the SPEs are designed for vectorized floating point code execution. 
+The PPE contains a 64 KiB level 1 cache (32 KiB instruction and a 32 KiB data) and a 512 KiB Level 2 cache. 
+The size of a cache line is 128 bytes.
+The PPE can complete two double precision operations per clock cycle using a scalar fused-multiply-add instruction,
+which translates to 6.4 GFLOPS at 3.2 GHz;
+or eight single precision operations per clock cycle with a vector fused-multiply-add instruction,
+which translates to 25.6 GFLOPS at 3.2 GHz.
+
+\begin{figure}[htb]
+  \begin{center}
+    \includegraphics[scale=0.4]{./pic/PPE.pdf}
+  \end{center}
+  \caption{PPE (Power Processor Element)}
+  \label{fig:ppe}
+\end{figure}
+
+\subsubsection{Synergistic Processing Elements (SPE)}
+Each SPE (Figure~\ref{fig:spe}) is composed of a ``Synergistic Processing Unit'', SPU, and a ``Memory Flow Controller'', MFC (DMA, MMU, and bus interface)\cite{cell-ibm}.
+An SPE is a RISC processor with 128-bit SIMD organization\cite{cell-ieee} for single and double precision instructions.
+With the current generation of the Cell, each SPE contains a 256 KiB embedded SRAM for instruction and data,
+called ``Local Storage'' (not to be mistaken for ``Local Memory'' in Sony's documents that refer to the VRAM)
+which is visible to the PPE and can be addressed directly by software. Each SPE can support up to 4 GiB of local store memory. 
+The local store does not operate like a conventional CPU cache since it is neither transparent 
+to software nor does it contain hardware structures that predict which data to load. Each SPE contains a 128-bit,
+128-entry register file and measures 14.5 mm$^2$ on a 90 nm process.
+An SPE can operate on sixteen 8-bit integers, eight 16-bit integers, four 32-bit integers, 
+or four single-precision floating-point numbers in a single clock cycle, as well as a memory operation. 
+Note that the SPU cannot directly access system memory; 
+the 64-bit virtual memory addresses formed by the SPU must be passed from the SPU 
+to the SPE memory flow controller (MFC) to set up a DMA operation within the system address space.
+At 3.2 GHz, each SPE gives a theoretical 25.6 GFLOPS of single precision performance.
+
+\begin{figure}[htb]
+  \begin{center}
+    \includegraphics[scale=0.5]{./pic/SPE.pdf}
+  \end{center}
+  \caption{SPE (Synergistic Processing Element)}
+  \label{fig:spe}
+\end{figure}
+
 % Cell の説明いれる
 
 % \subsection{Mailbox}
@@ -133,11 +214,10 @@
 \end{small}
 
 
-\begin{tiny}
 \begin{table}[h]
 \caption{Benchmark}
 \label{table:benchmark}
-\small
+{\scriptsize
 \begin{tabular}[t]{c||r|r|r}
 \hline
 & Word Count & Sort & Prime Counter\\
@@ -154,9 +234,8 @@
 \hline
 24 CPU (Xeon)& 40 ms & 100 ms & 31 ms\\
 \hline
-\end{tabular}
+\end{tabular}}
 \end{table}
-\end{tiny}
 
 % Word Count 	354 / 70 = 5.0571
 % Sort		846 / 163 = 5.1901
@@ -180,7 +259,7 @@
 In addition, Cerium Task Manager has many type of task, is a drawback of such description.
 This can be solved by the system description the dependency of the task rather than on the user side.
 
-\nocite{cell_abi, opencl, clay200912}
+\nocite{cell_abi, opencl, clay200912, cell_wiki, cell_cpp, cell_sdk, libspe2}
 % \nocite{yutaka:2010a, cell_abi, cell_cpp, cell_sdk, libspe2, ydl, clay200912, fix200609}
 \bibliographystyle{junsrt}
 \bibliography{cerium.bib,book.bib}
Binary file Paper/pic/PPE.pdf has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Paper/pic/PPE.xbb	Mon Jul 23 11:58:20 2012 +0900
@@ -0,0 +1,8 @@
+%%Title: ./PPE.pdf
+%%Creator: extractbb 20110311
+%%BoundingBox: 0 0 453 454
+%%HiResBoundingBox: 0.000000 0.000000 453.000000 454.000000
+%%PDFVersion: 1.3
+%%Pages: 1
+%%CreationDate: Mon Jul 23 08:57:46 2012
+
Binary file Paper/pic/SPE.pdf has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Paper/pic/SPE.xbb	Mon Jul 23 11:58:20 2012 +0900
@@ -0,0 +1,8 @@
+%%Title: ./SPE.pdf
+%%Creator: extractbb 20110311
+%%BoundingBox: 0 0 380 340
+%%HiResBoundingBox: 0.000000 0.000000 380.000000 340.000000
+%%PDFVersion: 1.4
+%%Pages: 1
+%%CreationDate: Mon Jul 23 08:57:52 2012
+
Binary file Paper/pic/cell-main.pdf has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Paper/pic/cell-main.xbb	Mon Jul 23 11:58:20 2012 +0900
@@ -0,0 +1,8 @@
+%%Title: ./cell-main.pdf
+%%Creator: extractbb 20110311
+%%BoundingBox: 0 0 423 300
+%%HiResBoundingBox: 0.000000 0.000000 423.000000 300.000000
+%%PDFVersion: 1.3
+%%Pages: 1
+%%CreationDate: Mon Jul 23 06:49:15 2012
+