%----------------------------------------------------------------------------------------
%	PACKAGES AND OTHER DOCUMENT CONFIGURATIONS
%----------------------------------------------------------------------------------------
4
5\documentclass[11pt,fleqn,hidelinks,oneside]{book} % Default font size and left-justified equations
6\usepackage[nottoc,notlot,notlof]{tocbibind}
7\makeindex % Tells LaTeX to create the files required for indexing
8%----------------------------------------------------------------------------------------
9
% Insert a code snippet, styled with Style1, anywhere in the document.
%   #1 -- snippet location/filename (also reused as the listing's label)
%   #2 -- caption for the listing (may be empty)
% The caption and label values are braced so that a comma or an equals sign
% inside them is not mistaken for a key/value separator.
\newcommand{\insertcode}[2]{%
	\begin{itemize}%
		\item[]\lstinputlisting[caption={#2},label={#1},style=Style1,float=h!]{#1}%
	\end{itemize}%
}
12
% Cross-reference helper: prints \ref{#1} in blue, wrapped in square brackets.
\newcommand{\myref}[1]{%
	\textcolor{blue}{[\ref{#1}]}%
}
15
% Typeset #1 in the running text and add an index entry for it; the entry is
% sorted under #1 and displayed in typewriter type.
\newcommand{\myindex}[1]{%
	\index{#1@\texttt{#1}}#1%
}
18
% Accumulator for the listings of the API functions mentioned in the current
% chapter. Starts out empty; \silentfunc appends to it and
% \SpillChapterFuncs prints and resets it.
\newcommand*{\ChapterFuncs}{}
20
% Append the listing snippets/#1_generated.h to \ChapterFuncs without
% typesetting anything at the point of use.
% For some reason, this doesn't work inside \item, so calling it only through
% \myfunc is not enough there.
% NOTE(review): presumably because the \def below is local and is undone when
% the enclosing list environment's group ends -- confirm.
\newcommand{\silentfunc}[1]{%
	\expandafter\def\expandafter\ChapterFuncs\expandafter{%
		\ChapterFuncs { } \insertcode{snippets/#1_generated.h}{}%
	}%
}
24
% Mention the API function ecore_#1 in the text.
%   #1 -- function name without the ecore_ prefix, underscores escaped (\_)
%   #2 -- name forwarded to \silentfunc to build the snippet filename
% Adds an index sub-entry under "API Function" (sorted as ZZZ), queues the
% function's listing via \silentfunc and prints ecore_#1() in typewriter type.
\newcommand{\myfunc}[2]{%
	\index{ZZZ@API Function!ecore\_#1@\texttt{ecore\_#1}}%
	\silentfunc{#2}%
	\texttt{ecore\_#1()}%
}
28
% Pushes its contents to the bottom of the current page, then clears the page.
\newenvironment{bottompar}%
	{\par\vspace*{\fill}}%
	{\clearpage}
30
% Close a chapter with a section that prints everything queued in
% \ChapterFuncs, then empty the queue so the next chapter starts afresh.
% (An earlier variant, since disabled, wrapped the output in the bottompar
% environment under a bold typewriter heading instead of a \section.)
\newcommand{\SpillChapterFuncs}{%
	\section{API functions discussed in this chapter}
	\ChapterFuncs{}%
	\renewcommand{\ChapterFuncs}{}%
}
39
40%----------------------------------------------------------------------------------------
41
\input{structure} % Insert the structure.tex file which contains the majority of the structure behind the template
43
% Global listings defaults: a small skip above and below every listing, and
% "here" placement for floated listings.
%\lstset{belowskip=-20pt plus 2pt}
\lstset{%
	aboveskip=\smallskipamount,%
	belowskip=\smallskipamount,%
	boxpos=h!,%
	float=h!%
}
% \@fptop is the glue LaTeX inserts at the top of a float page; a fixed 5pt
% replaces it here.
\makeatletter
\@fptop=5pt\relax
\makeatother
49
50
51\usepackage{hyperref}
52\usepackage{verbatim}
53
54%Macros
% Wrap #1 (e.g. a run of \mlisti entries) in an itemize list.
\newcommand{\mlist}[1]{%
	\begin{itemize}%
		{#1}%
	\end{itemize}%
}
% One list entry: #1 in red, followed by " -- " and the description #2.
\newcommand{\mlisti}[2]{%
	\item {\textcolor{red}{#1} -- #2}%
}
57
% Box registers used by \greybox. They are allocated once here rather than
% inside the macro: \newbox claims a fresh register every time it is executed,
% so allocating on each call would leak two box registers per \greybox.
\newbox\contentbox
\newbox\bkgdbox

% \greybox{<content>} -- typeset <content> (paragraph breaks allowed) on a
% grey background, padded by \columnsep on every side and followed by one
% \baselineskip of vertical space.
% The background colour is set with \pdfliteral, so a pdfTeX-compatible
% engine is required.
\long\def\greybox#1{%
    % Build the padded content first; its dimensions size the background rule.
    \setbox\contentbox\hbox to \hsize{%
        \vtop{
            \kern\columnsep
            \hbox to \hsize{%
                \kern\columnsep%
                % NOTE(review): as written, this row holds two \columnsep kerns
                % plus a \vbox set at the full \hsize, i.e. it is wider than
                % \hsize. A line narrowing \hsize by 2\columnsep may have been
                % lost before the \setlength below -- confirm against the
                % original file.
                \setlength{\textwidth}{\hsize}%
                \vbox{
                    \parskip=\baselineskip
                    \parindent=0bp
                    #1
                }%
                \kern\columnsep%
            }%
            \kern\columnsep%
        }%
    }%
    % Paint a grey rule with exactly the content box's width, height and depth.
    \setbox\bkgdbox\vbox{
        \pdfliteral{0.75 0.75 0.75 rg}
        \hrule width  \wd\contentbox %
               height \ht\contentbox %
               depth  \dp\contentbox
        \pdfliteral{0 0 0 rg}
    }%
    % Give the background zero width so the content is printed on top of it.
    \wd\bkgdbox=0bp%
    \vbox{\hbox to \hsize{\box\bkgdbox\box\contentbox}}%
    \vskip\baselineskip%
}
89
% Grey-boxed remark: #1 in red, then " -- " and the text #2.
\newcommand{\greycom}[2]{%
	\greybox{\textcolor{red}{#1} -- #2}%
}
91
92
% Base mdframed style shared by the call-out environments below: black inner
% line, red 2pt outer line, rounded corners, light brown background, and no
% page break inside the frame.
% NOTE(review): linewidth is given without a unit; this relies on mdframed
% applying its default unit -- confirm against the mdframed documentation.
\global \mdfdefinestyle{MyMdStyle}{%
	linecolor=black,%
	linewidth=1,%
	outerlinecolor=red,%
	outerlinewidth=2pt,%
	roundcorner=5pt,%
	backgroundcolor=brown!10,%
	nobreak=true%
}
97
% Warning call-out: a Warning environment inside a MyMdStyle frame, followed
% by 5pt of vertical space.
\newenvironment{warning}{%
	\par
	\begin{mdframed}[style=MyMdStyle] \begin{Warning}%
}{%
	\end{Warning}\end{mdframed}\vspace{5pt}\par
}
101
% Horizontal rule spanning the current line width, 0.5mm thick.
\newcommand*{\HRule}{\rule{\linewidth}{0.5mm}}
% Open-question call-out: a question environment inside a MyMdStyle frame
% with a blue outer line and a light blue background, preceded by 3pt of space.
\newenvironment{TBD}{%
	\par\vspace{3pt}%
	\begin{mdframed}[style=MyMdStyle,outerlinecolor=blue,backgroundcolor=blue!10]%
	\begin{question}%
}{%
	\end{question}\end{mdframed}\par
}
108
% Notice call-out: the body is set as a single list item labelled with a large
% \Info symbol, inside a MyMdStyle frame with black 1.5pt outer line.
% The generic list environment is used because the {label}{settings} argument
% pair belongs to list; itemize takes no arguments, so with itemize the
% \leftmargin/\labelwidth settings were executed inside a throw-away group
% and had no effect.
\newenvironment{NOTICE}{%
	\par
	\begin{mdframed}[style=MyMdStyle,outerlinecolor=black,linecolor=black,outerlinewidth=1.5pt]%
	\begin{list}{}{\leftmargin=1cm \labelwidth=\leftmargin}%
	\item[\Large\Info]%
}{%
	\end{list}\end{mdframed}\par
}
115
% Reminder call-out: a reminder environment inside a MyMdStyle frame with
% blue inner and outer lines (outer line 2pt).
\newenvironment{REMINDER}{%
	\par
	\begin{mdframed}[style=MyMdStyle,outerlinecolor=blue,linecolor=blue,outerlinewidth=2pt]%
	\begin{reminder}%
}{%
	\end{reminder}\end{mdframed}\par
}
121
122\bibliographystyle{plain}
123
124\begin{document}
125
126\begin{titlepage}
127\begin{center}
128
129% Upper part of the page. The '~' is needed because \\
130% only works if a paragraph has started.
131\includegraphics[width=0.5\textwidth]{./qlogic-logo}~\\[3cm]
132
133% Title
134\HRule \\[0.4cm]
135{ \huge \bfseries E4 ecore \\[0.4cm] }
136
137\HRule \\[1.5cm]
138
139\begin{minipage}{0.4\textwidth}
140\begin{flushleft} \large
141\emph{Authors:}\\
142Ariel \textsc{Elior} \\
143Michal \textsc{Kalderon} \\
144Yuval \textsc{Mintz} \\
145Merav \textsc{Sicron} \\
146Tomer \textsc{Tayar} \\
147Sudarsana Reddy \textsc{Kalluru} \\
148\end{flushleft}
149\end{minipage}
150\begin{minipage}{0.4\textwidth}
151\begin{flushright} \large
152\emph{Version:} \\
1530.0.10
154\end{flushright}
155\end{minipage}
156
157\vfill
158
159% Bottom of the page
160{\large \today}
161
162\end{center}
163\end{titlepage}
164
166\chapterimage{qlogic-full-36}
168
169\cleardoublepage % Forces the first chapter to start on an odd page so it's on the right
170
172
173
174%----------------------------------------------------------------------------------------
175%	Real Content
176%----------------------------------------------------------------------------------------
177\chapterimage{pictures/qlogic-full-36.jpg}
178\chapter{Introduction}
179By definition, a driver is the entity which allows an OS to drive a hardware device.
180As such the driver contains both device-specific parts and OS-specific parts.
181The Everest architecture, with programmable fastpath processors (Storms), host-based device-dedicated memory (ILT), and minimal on-chip management presents a device which requires a driver with significant portions of device-specific code.
182
183Drivers will be implemented for Everest 4 devices in many OSs (linux, windows, freebsd, solaris, esx, aix, hpux…).
184Implementing the device-specific code again and again in each OS is both wasteful and difficult to maintain.
For this purpose the ecore was conceived:
a large mass of code for operating and interacting with the Everest 4 device, to be incorporated into and used by OS drivers.
187
188In the abstract, the ecore is a layer between the HW/FW and the OS.
189It is device-specific and OS-agnostic. When ecore code requires OS services (e.g. memory allocation, pci configuration space access, etc.) it calls an abstract OS function for that purpose. These are implemented in OS-specific layers.
190Ecore flows may be driven by the HW (e.g. by an interrupt) or by the OS specific portion of the driver (e.g. driver load/unload).
191
192\begin{itemize}
193
194	\item Slowpath flows tend to reside largely in ecore and less so in OS specific layers. As much of the functionality as possible is placed in the ecore to leverage it across multiple platforms. \\
195
196	\item Fastpath flows tend to be in the OS specific layer as too much layering and abstraction is out of place in fastpath.
197However, the fastpath would usually be set up by ecore flows, for example the address where transmission flow should write a doorbell to the BAR is determined by the ecore at init phase and this address is supplied by ecore to the OS specific layer. \\
198
199\end{itemize}
200
201Different drivers in the same OS may have the ecore within them, and may use it for similar or different purposes:
202
203\begin{exampleT}
204	In linux there will be an ethernet driver, an fcoe driver, an iscsi driver, a roce driver and also a slim driver for the diag utility.
	All of these may exist in the same system.
206	All of these will have an ecore instance incorporated in them.
207	Either one of the drivers might use the ecore to initialize the device, or the sections of the device pertaining to that driver’s operation.
208	A storage driver may use the ecore for storage specific purposes, such as the initialization and allocation of task context.
209\end{exampleT}
210
211The ecore is not a driver in its own capacity, but only code which is used by other drivers. Thus, separate drivers, including separate instances of the same driver within an OS, have separate instances of the ecore within them, which are concurrently active.
212
\section{Scope}
214This document strives to define and detail what is the ecore.
215The first parts of the document deal with the concept of the ecore, and its place in the software layers between the device and the OS.
216The rest of the document deals with the content of the ecore.
217This document does not deal with the needs and use cases of any specific OS or tool, but only with the common ground which is the ecore.
218
The document sometimes delves in depth into the inner workings of the ecore. Since a programmer coming to utilize the ecore might not need [or want] to know those inner workings, such a person should look into specific sections in each chapter, specifically:
220\begin{enumerate}
221	\item Chapter \ref{cha:overview}'s introduction and section \ref{sec:overview-api} for a listing of the ecore API files and their locations.
222
223	\item OS abstraction layer [\ref{sec:osal}] for functions needed to be implemented by upper-layer driver in order to support the ecore.
224
225	\item Register-access [\ref{cha:reg}], mainly for learning about PTTs which are required by various ecore API functions.
226
227	\item Initialization and De-initialization of the HW [ \ref{sec:init-init}, \ref{sec:init-de-init}].
228
229	\item Status block initialization [\ref{ssec:sb-init}] and Interrupt handling flow [\ref{sec:sb-flow}].
230
232
233	\item Protocol related initialization/de-initialization:
234	\begin{enumerate}
235		\item L2-related, see Chapter [\ref{cha:l2}].
236	\end{enumerate}
237\end{enumerate}
238
239In addition, each chapter which includes ecore API functions that can be called by the upper-layer driver lists those functions' prototypes at its end.
240
241%\bibliography{ecore}
242
243\chapterimage{qlogic-full-36}
244\chapter{Ecore interface overview}
245\label{cha:overview}
246The ecore can be found at the perforce servers under:
247\begin{center}
248	//servers/main/nx2/579xx/drivers/ecore
249\end{center}
250
251Most of the ecore consists of the \textit{inner} parts, i.e., HW-oriented implementation to which the upper-layer driver writer is oblivious.
252Above that is a concise API layer, through which the upper-layer driver should manipulate the ecore code.
253
254\section{Ecore API}
255\label{sec:overview-api}
256The Ecore API contains two types of files:
257\begin{enumerate}
258	\item Files of the format \texttt{ecore\_<module>\_api.h} -- these files are the SW API between the ecore and the upper-layer driver:
259	\begin{enumerate}
260		\item \texttt{ecore\_cxt\_api.h}.
261		\item \texttt{ecore\_dev\_api.h}.
262		\item \texttt{ecore\_fcoe\_api.h}.
263		\item \texttt{ecore\_int\_api.h}.
264		\item \texttt{ecore\_iov\_api.h}.
265		\item \texttt{ecore\_iscsi\_api.h}.
266		\item \texttt{ecore\_ll2\_api.h}.
267		\item \texttt{ecore\_roce\_api.h}.
268		\item \texttt{ecore\_sp\_api.h}.
269		\item \texttt{ecore\_vf\_api.h}.
270		\item \texttt{ecore\_mcp\_api.h}.
271	\end{enumerate}
	\item Files of the format \texttt{ecore\_hsi\_<protocol>.h} -- these files contain the API between FW/HW and the ecore/upper-layer driver:
273	\begin{enumerate}
274		\item \texttt{ecore\_hsi\_common.h}.
275		\item \texttt{ecore\_hsi\_eth.h}.
276		\item \texttt{ecore\_hsi\_fcoe.h}.
277		\item \texttt{ecore\_hsi\_iscsi.h}.
278		\item \texttt{ecore\_hsi\_roce.h}.
279		\item \texttt{ecore\_hsi\_tcp.h}.
280		\item \texttt{ecore\_hsi\_toe.h}.
281	\end{enumerate}
282\end{enumerate}
283Upper-layer driver should not include any other ecore header file, as the rest of the header files are internal, with the following exceptions:
284\begin{itemize}
285	\item \texttt{ecore\_chain.h} -- Networking drivers will probably want to include this to benefit from the already-implemented chain.
286	\item \texttt{ecore\_utils.h} -- Useful macros which can be used by upper-layer driver.
287	\item \texttt{ecore\_status.h} -- contains \texttt{enum \_ecore\_status\_t}. Many of the ecore return values are of this type.
288\end{itemize}
289
290\begin{warning}
291Currently \texttt{ecore.h, ecore\_proto\_if.h} should also be included by upper-layer driver; This will (hopefully) be fixed shortly.
292\end{warning}
293
294
295\section{Ecore Internal files}
296This lists the ecore files, giving each a short description:
297
298\begin{itemize}
299	\item \texttt{ecore\_attn\_values.h}
300
	\item \texttt{ecore\_chain.h} -- Implements a cyclic chain; Used for various interfaces with the FW [Buffer-Descriptors, Event Queues, etc.].
302
303	\item \texttt{ecore\_cxt\_api.[ch]} -- Handles the allocation, configuration and distribution of contexts to the various clients.
304
	\item \texttt{ecore\_dbg\_fw\_funcs.[ch], ecore\_dbg\_values.h, ecore\_fw\_defs} -- Files which contain code related to various debug features ecore can provide [e.g., grcDump].
306
307	\item \texttt{ecore\_fcoe.[ch], ecore\_iscsi.[ch], ecore\_ll2.[ch], ecore\_roce.[ch]} -- files containing specific ecore code for the storage protocols.
308
309	\item \texttt{ecore\_dev.[ch]} -- Contains much of the functionality of starting/stopping the hardware. See chapter \ref{cha:hwinit}.
310
311	\item \texttt{ecore\_hw.[ch], ecore\_gtt\_reg\_addr.h, ecore\_gtt\_values.h} -- contains the functionality for register access and DMAE. See chapter \ref{cha:reg}.
312
	\item \texttt{ecore.h} -- contains the definition of the most \textit{elementary} structures in the ecore, the \texttt{ecore\_dev} and the \texttt{ecore\_hwfn}.
314
315	\item \texttt{ecore\_init\_defs.h, ecore\_init\_fw\_funcs.[ch], ecore\_init\_ops.[ch], \\ ecore\_init\_values.h, ecore\_rt\_defs} -- Code responsible for initialization and configuration of the HW and loading of the FW, mostly in relation with the init-tool. See chapter \ref{cha:hwinit}.
316	\begin{REMINDER}
317			Chapter \ref{cha:hwinit} doesn't really give a thorough explanation of the init tool - at most it mentions it. Do we want a section/chapter of it somewhere?
318	\end{REMINDER}
319
320	\item \texttt{ecore\_int.[ch]} -- Handles interrupts and attentions. See chapter \ref{cha:int}.
321
322	\item \texttt{ecore\_iro.h, ecore\_iro\_values.h} -- Generated FW files. Enables ecore to access [or supply to upper-layer] addresses inside the \texttt{storm}'s RAM.
323
324	\item \texttt{ecore\_mcp.[ch]} -- Contains the interface between the ecore and the MFW. See chapter \ref{cha:mfw}.
325
	\item \texttt{ecore\_sp\_commands.[ch], ecore\_spq.[ch]} -- Contains the slowpath logic required for sending ramrods and configuring \& handling the various slowpath events.
327
328	\item \texttt{ecore\_sriov.[ch], ecore\_vf.[ch], ecore\_vfpf\_if.h} -- Contains the SRIOV implementation both from the PF and VF sides.
329\end{itemize}
330
331\section{OS abstraction Layer}
332\label{sec:osal}
333
334%\section{Driver Core}
335%As the ecore contains most of the lowlevel code operating the non-fastpath parts of the working with the HW and FW, it can be thought of as some sort of library – it contains bits of code meant to be operated from an outside source. Each OS needs to implement its own driver, calling the various functions in the ecore API in a place fitting for that OS driver flows.
336%Each OS will independently need to create a driver that incorporates the ecore, both filling the OS dependent callbacks required by the ecore to perform and supply an upper level of abstraction which best suits that OS. Notice this upper layer is sometimes also, mistakenly, referred to as ecore [e.g., bnx2c for linux drivers] but there’s an important distinction:
337%\begin{itemize}
338%	\item Ecore – shared code between ALL operating systems.
339%	\item Upper-Layer – shared code by all drivers on a single operating system.
340%\end{itemize}
341
342%It’s possible [and likely] that an operating system will break the various protocols into different sub-drivers, where each sub-driver will be designated for a specific protocol. Notice that if such separation is made, the preferred implementation is that the OS will implement a ‘core’ driver consisting of the Ecore and an upper-layer, and define an API through which the various protocol drivers communicate with the OS core driver\footnote{Although notice there should be no inter-dependencies between HW-functions in the ecore, so the alternative method where each contains the ecore is also feasible}.
343
344The ecore utilizes various functions which should be implemented by the upper layer. There are two main ‘types’ of functions:
345\begin{enumerate}
	\item Basic OS-specific operations that the ecore needs in order to perform its work; e.g., memory allocations – the ecore needs to allocate memory for various reasons, and it needs the upper layer to supply the method by which it can do so.
347	\item Hooks by which the upper-layer can run additional OS specific code, or make decisions affecting the work of the ecore. E.g., in the SRIOV flows, the mechanism for passing messages from VF to PF is implemented in the ecore but the decision whether a request is valid or not might be OS specific – as in the case of unicast filters.
348\end{enumerate}
349
350The various functions that need to be implemented by the upper-layer can be found in Appendix \ref{app:osal} -- OSAL Documentation.
351
352
353\section{Ecore print scheme}
354The ecore utilizes several printing methods to print messages to the system logs; It requires some functions to be implemented by the upper-layer for this to work – the required documentation can be found in Appendix \ref{app:osal} -- OSAL Documentation.
355In order to support this, the verbosity mechanism contains two distinct values \myindex{\texttt{DP\_LEVEL}} and \myindex{\texttt{DP\_MODULE}} [both can be found in \texttt{ecore.h}]. Since the printing scheme in the ecore was defined with the linux limitations in mind – that is, the API [via ethtool] allowing the setting of the debug message level is only 32-bit long, both \texttt{DP\_MODULE} and \texttt{DP\_LEVEL} together contain only 32-bits.
The \texttt{DP\_LEVEL} determines which prints will actually reach the logs based on the message urgency, defining 4 levels – verbose, info, notice and error. When the level is set, all prints which are at least as urgent will be printed. Notice this means there’s a single level – e.g., you can’t have a configuration in which you’ll get all the ‘info’ level prints, but not the ‘notice’ level ones.
357The \texttt{DP\_MODULE} is relevant only when level is set to verbose, and it defines which of the verbose prints should reach system logs, based mostly on component/flow. When setting the module level, a bit mask of the requested components/flows is set.
358In order to set which prints should reach system logs, the upper layer should utilize the ecore function \myfunc{init\_dp}{init_dp} defined in \texttt{ecore\_dev.c}.
359
360\section{Compilation flags}
361The ecore project contains several optional compilation flags that if passed would affect the content compiled. A few notable flags:
362\begin{itemize}
	\item ASIC\_ONLY -- By default, this is ‘off’. Setting this would remove content that is relevant only for simulations of the hardware, i.e., emulations and FPGAs.

	\item REAL\_ASIC\_ONLY -- By default, this is ‘off’. Setting this would remove content that is relevant for non-productized hardware, e.g., workarounds for BigBear A0.

	\item REMOVE\_DBG -- By default, this is ‘off’. There are several structures and fields in ecore which aren't functional; their sole purpose is to store interesting data for memory dumps in case of failures. Setting this would remove all such data items.
368\end{itemize}
369
370\SpillChapterFuncs
371
372
373\chapterimage{qlogic-full-36}
374\chapter{Register Access}
375\label{cha:reg}
376This section describes the ecore API for accessing registers.
377The E4 bar is a reduced BAR, i.e., it does not map the entire register address range.
378To access the entire range, windows are defined that can be configured to point to a certain address within the device and allow reading and writing of registers / memory from that address.
379There are two types of windows, \textbf{PTT} (per PF Translation Table) and \textbf{GTT} (Global Translation Table).
380
381The \textit{external BAR} is the BAR accessed by the ecore. It is divided into configurable windows which point to different areas within the device (Image \ref{fig:bars}, Internal BAR vs. External BAR, demonstrates this).
382
383\begin{figure}[ht]
384	\caption{Internal BAR vs. External BAR}
385	\centering
386	\includegraphics[width=0.8\paperwidth]{reg_access}
387	\label{fig:bars}
388\end{figure}
389
390For more details on the E4 BAR access scheme the reader is referred to the “Reduced PF BAR0 size” section of \cite{doc:PXP}. \\
391
392
393All register access should be done within the ecore layer and it is not expected for the upper layers to access registers at all.
394For this reason, there is no description here on how to find the register address and how to distinguish whether the address is mapped into a \myindex{GTT} or a \myindex{PTT}.
However, in case a need does arise in the future, the API for reading/writing is detailed below as well.
396
397Ecore requires an OSAL implementation of the macros:
398\begin{enumerate}
399	\item \myindex{REG\_RD}
400	\item \myindex{REG\_WR}
401\end{enumerate}
402These macros are a direct read / write from the BAR with the absolute address offset given.
403Implementation should add the offset to the mapped BAR address and call the appropriate OS specific API.
404
405Several ecore interface functions require a PTT. There is a pool of PTTs maintained by ecore.
406The reason there are several PTTs is to enable simultaneous access to device registers from different flows.
The PTT is reserved per flow, and it is the responsibility of the upper layer to make sure it does not use the same PTT in flows that can run concurrently. The upper layer requests a PTT entry using \myfunc{ptt\_acquire}{ptt_acquire}.
408However, to avoid running out of this resource, it is also the responsibility of the upper layer not to acquire too many PTTs without releasing them. Returning a PTT entry back to the pool is done via \myfunc{ptt\_release}{ptt_release}.
409
410Using a PTT, ecore [and upper-driver] can access registers/memories using inner BAR addresses; The ecore is responsible for configuring the memory windows, and translates the inner address into an external address [i.e., one which resides on the actual BAR as seen by the host]. The register access is then made by calling \texttt{ecore\_wr} and \texttt{ecore\_rd}.
411\SpillChapterFuncs
412
413
414\chapterimage{qlogic-full-36}
415\chapter{Hardware/Firmware initialization}
416\label{cha:hwinit}
417
418\section{Basic concepts -- inner-working of the ecore}
419\begin{itemize}
420	\item \myindex{ILT} – one of the features of our device is that the memories used by various HW blocks are allocated on the host memory [as opposed to large embedded memory segment on chip]. The driver is responsible for allocating the memory needed for those HW blocks [DMA-coherent memory] and configure both the HW blocks themselves and a communal sub-block known as ILT. The ecore contains complicated code that decides exactly how much memory each such block needs, allocates it in an ‘ilt\_shadow’, and then uses that shadow to configure the ILT itself with all the allocated chunks of memory.
421
422Additional ILT documentation is at \cite{doc:ILT}.
423
424	\item \myindex{RT array} – when the ecore initializes the HW, it utilizes a common, tool-generated code known as the init-tool. Since there are quite a few values which depend upon actual setup configuration and thus must receive feedback during the initialization from the ecore, instead of adding many such hooks there’s the concept of the RunTime array – an array of values filled by the ecore prior to the init-tool run based on the complex ecore logic. The init-tool will then utilize the values in that array to configure the HW according to the correct order of configuration [i.e., writing the values set by ecore in the array in the correct place in the initialization flow where they’re required/the block that contains them is configured].
425\end{itemize}
426
427\section{Initialization}
428\label{sec:init-init}
429The functions required for initializing the HW/FW mostly reside in \texttt{ecore\_dev.[ch]}; More accurately, most of the outside API [toward the upper-layer] is in \texttt{ecore\_dev.h} – the functions themselves utilize many other ecore files.
430This section gives a brief description of the functions that need to be called, what they do, requirements, etc., in order to successfully initialize the ecore structs and load the HW/FW.
431
432\silentfunc{init_struct}
433\silentfunc{hw_prepare}
434\silentfunc{resc_alloc}
435\silentfunc{resc_setup}
436\silentfunc{hw_init}
437\begin{itemize}
	\item \myfunc{init\_struct}{init_struct} – After allocating and zeroing the ecore\_dev [the upper-layer responsibility], a pointer to it should be passed to this function for some early initialization of the data structure. \\
439
440	\item \myfunc{hw\_prepare}{hw_prepare} – This function serves two purposes [plus some additional inner ecore workings]:
441	\begin{enumerate}
442		\item It enables the ecore to access its BAR, doing things such as enabling the PTT pool and opening the access in the PGLUE\_B block.
443		Notice this doesn’t actually do anything to the PCI BAR itself – the upper-layer should have initialized those before calling this function, and must guarantee that its REG\_WR/RD functions actually point to valid, accessible addresses.
444		\item It learns as much as it can about system configuration from HW and SHMEM.
445	\end{enumerate}
446
447Trying to access registers except for pci-related ones prior to calling this function will fail. \\
448
	\item \myfunc{resc\_alloc}{resc_alloc} – Allocates the various ecore-related memory, e.g., contexts, slowpath queue, SRIOV information, etc. Notice that before calling this function, each HW-function of the \texttt{ecore\_dev} should have its ‘pf\_params’ set, as the function depends upon the protocol-specific resources for its calculations. \\
450
451	\item \myfunc{resc\_setup}{resc_setup} – Configures the various slowpath elements. Notice that since there’s no guarantee chip is alive at this point [i.e., it’s very likely the chip is reset at this point], it fills the configuration in the runtime array instead of actually writing it to chip. \\
452
453	\item \myfunc{hw\_init}{hw_init} – This function actually initializes the chip, using the init-tool and the runtime array to make the correct configuration.
454	 As part of the slowpath interrupt enablement, ecore invokes OSAL\_SLOWPATH\_IRQ\_REQ() callback for each HW function. The client implementation should setup the IRQ handlers for slowpath interrupt handling.
455	 This is required since as part of the flow the \texttt{function\_start} ramrod will be sent to FW; Once FW finishes handling it, an \myindex{EQE} [Event Queue Element] will be placed in the slowpath event queue and an interrupt will be fired. The flow is dependent on the EQE being processed.
456
457	Some interesting sub-functions of the \texttt{ecore\_hw\_init()} method, at least for debugging purposes as many possible errors can be caught there:
458	\begin{itemize}
459		\item \texttt{ecore\_get\_init\_mode()} – this creates a bitmask which will be later passed to the init-tool which describes the configured mode – Multi function vs. Single function, 40G vs. 100G etc. A wrong configuration here could explain many peculiar events later on. \\
460
461%		\item ecore\_mcp\_load\_req() – the MFW [assuming it is present] will answer with one of 3 possible answers: ENGINE, PORT or FUNCTION.
462%		The MFW is responsible for initializing the common blocks [i.e., the HW blocks shared between the 2 engines], but the driver is responsible for the rest.
463%		Each function needs to perform different initialization based on whether it’s the first to load on its engine [ENGINE], the first to load on its port [PORT] or if it’s being loaded on an already initialized port [FUNCTION]\footnote{Initialization which is common for both engines will be performed by the MFW.}.
464%	Some very basic errors can be detected here, if the function receives an unexpected answer from MFW.
465	\end{itemize}
466
467Once this function returns, the chip is initialized, FW is functional and slowpath event queues are operational.
468
469\end{itemize}
470
471\section{Zipped and Binary firmware}
472\label{sec:init-Zipped and Binary firmware}
473\begin{itemize}
474	\item \myindex{Zipped Firmware} - There are two types of firmware files generated in ecore.\\
475Non-zipped firmware [ecore\_init\_values.h and ecore\_init\_values.bin] and Zipped firmware [ecore\_init\_values\_zipped.h and
476ecore\_init\_values\_zipped.bin] files. Each type of file is generated in two formats that is a C header file and binary file,
477where each has all relevant data needed to initialize the firmware. Either of these file types can be used for firmware initialization.
The difference is that Zipped firmware files have a lot of the DMAE firmware data zipped, which is beneficial in reducing the code size.\\
479
480By default, the non-zipped variant is used. If ecore clients want to use zipped version of firmware then they need to have
481CONFIG\_ECORE\_ZIPPED\_FW defined/enabled by their operating system drivers to make feature operational. For unzipping the
482zipped firmware data ecore clients need to implement OSAL\_UNZIP\_DATA() as well. This OSAL is meant for unzipping the
483zipped firmware data in order to do firmware initialization.\\
484
485	\item \myindex{Binary Firmware} - As explained above there are two formats of firmware files
486generated by ecore, C header files [ecore\_init\_values.h and ecore\_init\_values\_zipped.h] and
487binary firmware files [ecore\_init\_values.bin and ecore\_init\_values\_zipped.bin]. Either of those files formats
488can be used by ecore clients to utilize firmware data. By default, ecore uses the .h files which are compiled as part of the ecore,
489but using binary firmware files has the advantage where the code size is reduced and the FW can be loaded from a file imported by
490the system.\\
491
492If ecore clients want to use firmware data from binary files then they need to have CONFIG\_ECORE\_BINARY\_FW defined/enabled by their
493operating system drivers to make feature operational. Ecore clients must store all binary firmware data from the
file into a buffer and pass a void* pointer to that firmware data buffer to ecore\_hw\_init() as an argument.
495If ecore client is not using binary firmware file or instead using firmware from regular header files then they
496should pass NULL as an argument for binary firmware data buffer in ecore\_hw\_init().
497
498
499\end{itemize}
500
501\section{De-Initialization}
502\label{sec:init-de-init}
503\silentfunc{hw_stop}
504\silentfunc{resc_free}
505\silentfunc{hw_remove}
506\begin{itemize}
507	\item \myfunc{hw\_stop}{hw_stop} – this function notifies the MFW that the HW-functions unload, stops the FW/HW for all HW-functions in the \texttt{ecore\_dev} including sending the common PF\_STOP ramrod for each HW-function, and disables the HW-functions in various HW blocks.
508	Notice that before calling this, all the protocol specifics done after initializing the HW should have already been reversed by the upper-layer [e.g., L2 VPORTs which were started by the upper layer should be stopped before calling this].
509	Following this function, it is guaranteed HW will not generate any more slowpath interrupts, so the interrupt handler can be released [and slowpath DPC context can be stopped]. \\
510
511	\item \myfunc{int\_disable\_post\_isr\_release}{ecore_int_disable_post_isr_release} – this function performs the required IRQ related cleanup post the ISR release. The function needs to be called after releasing all slowpath IRQs of the device.
512
513	\item \myfunc{resc\_free}{resc_free} – Releases the memory allocated by the ecore during \texttt{ecore\_resc\_alloc()}. \\
514
515	\item \myfunc{hw\_remove}{hw_remove} – Releases the memory allocated early by the ecore during \texttt{ecore\_hw\_prepare()}.
516	Following this, REG\_RD/REG\_WR are no longer operational - upper layer can disable the PCI BAR.
517\end{itemize}
518\SpillChapterFuncs
519
520%\chapterimage{qlogic-full-36}
521%\chapter{Firmware hsi}
522%\begin{NOTICE}
523%Placeholder - owner Michal
524%\end{NOTICE}
525
526
527\chapterimage{qlogic-full-36}
528\chapter{Interrupts}
529\label{cha:int}
530This chapter describes how the device notifies the driver about operations -
531it describes how firmware status is reflected on host memory via status blocks, and how the firmware initiates an interrupt toward the driver.
532
533A reference document that fully describes status blocks can be found at \cite{doc:SB}.
534
535
536\section{Status blocks - host point of view}
537The \myindex{status block} structures are allocated on host memory. The status block is an array of indices which are updated by firmware (mainly ring consumer values).
538There are 288 status blocks per path in Big Bear and 368 in K2.
539
540When one of the indices on a status block is updated (because some event occurred at the device), the status block is copied from internal device memory to host memory, and an interrupt is generated.
541The CAU unit may aggregate several events and generate a single update of the status block and a single interrupt, in order to lower the number of interrupts sent to host CPU.
542
543The indices of the status blocks are referred to as \myindex{protocol indices} (abbreviated to \textit{pi}).
544Originally, the motivation behind multiple status blocks was to enable multiple protocols to work with the same status block, giving each protocol a different index.
545However, with single personality this is no longer the case.
546Multiple indices are used for L2 to differentiate between RX / TX and different class of service operations.
547
548\subsection{Initialization}
549\label{ssec:sb-init}
550There is a dedicated status block for ecore usage which is allocated and maintained by ecore.
551The fastpath status blocks used for traffic need to be allocated by the protocol driver.
552This memory must be DMA-coherent memory.
553The ecore defines a structure called \texttt{ecore\_sb\_info} which should be allocated by the protocol driver and initialized using the function \myfunc{int\_sb\_init}{int_sb_init}
554%[code snippet \ref{snippets/ecore_int_sb_init.h}].
555This structure is later used for calling the functions \texttt{ecore\_sb\_update\_sb\_idx()} and \texttt{ecore\_sb\_ack()}.
556
557%\insertcode{snippets/ecore_int_sb_init.h}{Initialize status blocks}
558
559\begin{NOTICE}
560	Status blocks need to be allocated and initialized before queues are created.
561\end{NOTICE}
562
563\section{Mode and configuration}
564The device can work in one of the following interrupt modes:
565\begin{enumerate}
566	\item INTA – Physical interrupt line.
567	\item MSI –  Message signaled interrupts. Device is programmed with one address to write to, and 16-bit data to identify the interrupt.
568	\item MSIX – Large number of interrupts (up to 2048) and each one gets a separate target address, making it possible to designate different interrupts to different processors.
569	This is the preferred mode for performance.
570	\item POLL – HW increments producers on status blocks in case of interrupts but it doesn't generate any message nor does it assert any physical line. It's the upper-layer responsibility to periodically poll on those changes to identify interrupts. \\
571\end{enumerate}
572
573Enabling and disabling interrupts is OS specific and done differently by the OS specific layer of the driver.
574However, the device needs to be configured differently according to the selected interrupt mode; This initialization is done by the ecore.
575
576In order to do so, the proper interrupt mode using an \myindex{ecore\_int\_mode} enum [can be seen in code snippet [\ref{snippets/ecore_int_mode.h}]] needs to be passed when calling \texttt{ecore\_hw\_init}.
577
578\insertcode{snippets/ecore_int_mode.h}{Enum for the interrupt mode}
579
580If upper-layer driver would later wish to change the interrupt mode, it can do so by calling \myfunc{int\_igu\_enable\_int}{int_igu_enable_int},
581or to \myfunc{int\_igu\_disable\_int}{int_igu_disable_int} when wishing to disable interrupt generation altogether.
582
583%\insertcode{snippets/ecore_int_endis.h}{Functions for enabling/disabling interrupts}
584
585In MSIX mode, each status block should generate its own interrupt message, meaning in reasonable OSes it should be possible to connect each interrupt with the specific handler of that interrupt's source.
586The \textit{sb\_id} passed as value to \textit{ecore\_int\_sb\_init()} will indicate the index of the vector in the MSI-X table that would be used to generate interrupts for this specific SB.
587I.e., if the value passed is $X$, then the $X^{th}$ MSI-X vector will generate interrupts for this SB.
588
589When working in INTA / MSI we work in single-ISR multiple-DPC mode; The same interrupt line can signify interrupts from many possible status blocks. In this case the information of which status block generated an interrupt needs to be read from a register in the IGU. Use \myfunc{int\_igu\_read\_sisr\_reg}{int_igu_read_sisr_reg} to get the information [returned value is a bitmask of status blocks which asserted the interrupt].
590
591%\insertcode{snippets/ecore_int_sisr.h}{INTA mechanism for reading interrupt source}
592
593\section{IGU block operation}
594The IGU block has a mapping of status blocks to interrupts.
595The mapping is done inside the IGU CAM and maps a (function, vector) pair to an MSI-X message.
596In case of INTA / MSI, each function has a register in the IGU stating which status block gave the interrupt.
597The IGU block is responsible for generating the interrupt. It receives the command to generate an interrupt from the CAU block.
598The IGU block maintains producer-consumer pairs per status block.
599The CAU updates the producer after it wrote the status block to host memory.
600The driver updates the consumer after it finished processing the status block.
601The IGU block generates an interrupt when there is a prod-cons difference on the status block.
602
603CAU also handles coalescing of status block writes and interrupt generation.
604The CAU unit may aggregate several events and generate a single update of the status block and a single interrupt, in order to lower the number of interrupts sent to host CPU.
605
606\section{Interrupt handling flow}
607\label{sec:sb-flow}
608The flow of handling an interrupt in the device and driver is as follows:
609\silentfunc{sb_update_sb_idx}
610\silentfunc{sb_ack}
611\begin{enumerate}
612	\item The device (Firmware/CAU) updates a status block index.
613
614	\item The device copies the status block to host memory and generates an interrupt.
615
616	\item OS is triggered, calling the driver's Interrupt Service Routine [ISR].
617
618	\item (Possible upper-half handling and bottom-half scheduling, or other OS-specifics which are outside the scope of this document).
619
620	\item Driver identifies a producer update on the status block (as the producer is written as part of the status block on host memory) using \myfunc{sb\_update\_sb\_idx}{sb_update_sb_idx}.
621
622	\item Driver scans the protocol indices in the status block to determine the interrupt source.
623	\begin{NOTICE}
624		It's likely the upper-layer doesn't really need to scan the status block, but rather compare values in some previous-supplied addresses against a shadow copy. E.g., In L2 the ecore callbacks configuring the queues will return the addresses which upper-layer should test for producer updates. See section [\ref{sec:l2-start}].
625	\end{NOTICE}
626
627	\item When Driver completes processing all the indices on the status block, it writes the producer value from the status block into the IGU consumer address, using \myfunc{sb\_ack}{sb_ack}.
628
629	\item The IGU compares the producer and consumer -- if they differ it will generate an additional interrupt.
630
631\end{enumerate}
632
633\begin{exampleT}
634	Assume an Rx packet is received by device. After FW places the packet in the Rx rings, it updates the status block of that Rx ring; This in turn is copied into host memory and an MSI-X interrupt for the appropriate Rx queue's status block is triggered.
635	Driver reads the status blocks, scanning the indices and identifies the interrupt is an Rx CQE consumer and handles the incoming packet. Assuming this is the only interrupt source [and there was also a single packet] driver then acks the status block.
636\end{exampleT}
637\SpillChapterFuncs
638
639
640\chapterimage{qlogic-full-36}
641\chapter{Management firmware [MFW] interface}
642\label{cha:mfw}
643
644The management firmware runs on its own processor on the chip [\myindex{MCP}] and has many responsibilities – it serves as the entity initially configuring the chip [during bios phase], answering the various management protocols, synchronizing between PFs, configuring the physical link, etc.
645HW functions and the \myindex{MFW} may interact with each other in both ways – driver may send messages to the MFW in the form of commands on a buffer, while the MFW generates attentions for the driver and posts messages in a designated mailbox in the SHMEM. The implementation of the interface resides in \texttt{ecore\_mcp.[ch]}, with the addition of \texttt{.h} files generated by the MFW owners, e.g., \texttt{mcp\_public.h} which contains the SHMEM structure and the list of commands.
646The API that should be included by upper-layer driver is defined in \texttt{ecore\_mcp\_api.h}.
647
648The interface between driver and MFW is initialized as early as possible in the initial initialization flow [specifically as part of \texttt{ecore\_hw\_prepare()}],  as this initializes the Driver access to SHMEM which is used later during initialization to learn about the chip configuration [which was read from NVRAM by MFW and written into SHMEM].
649The upper layer doesn’t need to take care of allocating/releasing of this interface – it’s part of the greater initialization/de-initialization of the ecore.
650
651\section{Shared Memory [SHMEM]}
652The \myindex{shared memory} is a segment of memory accessible to all functions as well as the MFW. The memory is used for various purposes:
653\begin{enumerate}
654	\item MFW fills it with current HW configuration, either based on the default found in the NVRAM or based on some management-protocol [e.g., it’s possible vlans configuration is determined by switch and communicated to the MFW]. Driver reads those values and decides upon its logical state/configures HW appropriately. \\
655
656	\item The driver--MFW interface is based on mailboxes in well-known addresses in the SHMEM. \\
657
658	\item It’s possible [as in E3] that there will be driver-held information that will be requested by some management-protocol, and the driver will have to fill it in some well-known address in the SHMEM.
659\end{enumerate}
660
661An upper-layer driver is not supposed to access the SHMEM directly; It should only do so by using ecore functions and accessing ecore structs. The ecore \textit{mcp\_info} struct contains as one of its fields \textit{func\_info} which is filled by the ecore during early device initialization with all the function-specific static\footnote{i.e., data that shouldn't change while driver is running} data. Upper-layer driver can read those values for its own usage.
662
663\section{Ecore - MFW interface}
664\begin{itemize}
665	\item Sending messages from driver to MFW -- Each HW-function has an address in the SHMEM in which the MFW will poll for messages from that HW-function.
666	A message is a u32 consisting of a command bit-mask which indicates the type of message the HW-function sends and a cyclic sequential number.
667	In addition there’s another u32 field which might contain additional parameters [command-specific].
668	The driver increases the sequence number and writes the message and then polls until the MFW writes its response [with the correct sequence number] to another known address in SHMEM.\footnote{Obviously, this is a one-pending mechanism.}
669	The MFW can also send an additional parameter [command-specific]. \\
670
671	\item Messages from MFW to driver -- MFW will trigger a general HW attention which will be handled by the specific HW-function [there’s a different general HW attention per HW-function].
672	Per-HW-function there’s an array of message producers in SHMEM,  of which the ecore maintains a copy.
673	Before sending the attention, the MFW will increment the producer of the message it wishes to inform the driver and the driver will recognize the message by noticing the difference in producers.
674	After handling said message, the driver will ack the message by writing the new producer back to SHMEM and disabling the general HW attention.
675	Notice it's [at least theoretically] possible for the ecore to encounter multiple MFW messages following a single attention from HW. \\
676\end{itemize}
677
678Notice the commands’ content vary -- some of the commands will require additional parameters to be filled in specific fields in the SHMEM before the commands are passed.
679
680\section{API between ecore's MCP interface and upper-layer driver}
681\myfunc{mcp\_cmd}{mcp_cmd} --  this is the very core of message-passing from driver to MFW. Upper-layer driver should pass the command (FW\_MSG\_CODE\_* from \texttt{mcp\_public.h}) and a parameter, as well as pointers for the MFW response and additional possible parameter. The function will pass the command to the MFW and wait [sleep] for its reply. \\
682
683A ‘special’ instance of this function is \texttt{ecore\_mcp\_load\_req()} [which isn’t an API function] - that function sends an indication to the MCP that the HW-function is being loaded.
684The MFW is used as both a book-keeper and synchronization mechanism for the loading of PFs, as there are communal resources. The response will be (FW\_MSG\_CODE\_DRV\_LOAD\_<X>), where X can be either ENGINE, PORT or FUNCTION:
685\begin{itemize}
686	\item Engine – HW-function is the first being loaded on its engine.
687	\item Port – Another HW-function has already initialized the engine, but this HW-function is first on its port.
688	\item Function – Another HW-function has already initialized the port.
689\end{itemize}
690According to the MFW response the ecore knows what needs to be initialized. \\
691
692\texttt{ecore\_handle\_mcp\_events()} – This function is called from the slowpath interrupt context [sleepless] upon MFW attention to the driver.
693Depending on the exact message received from the MFW, it’s possible that this will eventually call some OSAL which will need to be implemented by the upper-layer driver, e.g., in case of link change indication [The upper-layer needs to be notified and should decide on its own what to do with that information].
694
697The MFW is responsible for configuring the physical link [i.e., MAC, PHY, etc.]. The ecore encapsulates the entire interface with MFW for configuring the link, leaving a relatively narrow API with the upper-layer driver.
698The ecore HW-function contains 2 related structures –
701\begin{itemize}
702	\item Link\_params – The ecore uses this as inputs for configuring the link; According to the values in this struct, the ecore will later configure shmem in the appropriate places so that once the MFW receives the command to set the link it will use this configuration.
703	During ecore initialization, the ecore will fill this structure with the default values from SHMEM [values set by MFW according to NVRAM configuration].
704When upper-layer driver wishes to update link configuration, it should change this struct.
706
707	\item Link\_output – The ecore fills the structure from attention handling context whenever the MFW indicates that a link change has occurred. Upper layer driver can read this to get information about the current state of the physical link. It can access this struct by calling \myfunc{mcp\_get\_link\_state}{mcp_get_link_state}.\\
708\end{itemize}
709
710In order to work with the ecore link interface, upper driver needs to implement an OSAL [\texttt{osal\_link\_update()}] which will be called whenever the link state has changed – this will notify the upper driver that the link has changed and that it should probably read link\_output and act upon it. \\
711
712In order to set/reset the link, the upper driver should call \myfunc{mcp\_set\_link}{mcp_set_link} after overriding the link\_params fields with its required link configured [optional, as without doing anything the structure will contain the default link configuration found in SHMEM].
713Passing true will cause MFW to try setting the link [either by force or via auto-negotiation, based on the configuration], while passing false will cause the MFW to reset the link.
714
715Notice the logic for link-flap-avoidance should be contained in MFW, e.g., in multi-function mode there’s no need for the upper-layer driver to count the number of functions loaded in order to decide whether during unload it should request a link reset; It should do it regardless.
716It’s the MFW's duty to decide whether the unloading function is actually the last loaded function on its port and thus whether to actually reset the link.
717
718\subsection{Energy Efficient Ethernet (EEE)}
719The EEE feature enables the device to put its transistors in sleep mode when there is no data activity on the wire, thereby achieving a significant reduction in the power consumption of the device. It's a Base-T feature, more details of which are captured under IEEE 802.3az standard. MFW negotiates the EEE parameters with the peer device and the results will be shared with the ecore as part of link notification. Following are the negotiated parameters which will be encapsulated in the struct \texttt{ecore\_mcp\_link\_state}.
720\begin{itemize}
721	\item eee\_active – EEE is negotiated and is currently operational.
724\end{itemize}
725Following are the EEE link parameters which can be queried by upper layer driver using \myfunc{mcp\_get\_link\_params}{mcp_get_link_params} API.
726\begin{itemize}
727	\item eee\_enable – EEE is enabled.
728	\item eee\_supported – Device supports EEE.
729	\item eee\_tx\_lpi\_enable – Determines whether the device should assert its Tx LPI.
730	\item eee\_tx\_lpi\_timer – EEE delay timer value, i.e., amount of time device should stay in idle mode prior to asserting its Tx LPI  (in  microseconds).
731\end{itemize}
732Upper layer driver can configure one or more of the following EEE parameters.
733\begin{itemize}
734	\item eee\_enable
736	\item eee\_tx\_lpi\_enable
737	\item eee\_tx\_lpi\_timer
738\end{itemize}
739
740\section{Dcbx Interface}
741\label{sec:mfw-dcbx}
742The MFW is responsible for negotiating the dcbx parameters [e.g., per priority flow control (PFC)] with peer device. During initialization, MFW reads the dcbx parameters from NVRAM (called local parameters) and negotiates these with the peer. The negotiated/agreed parameters are called operational dcbx parameters. MFW provides driver interfaces for querying and configuring the dcbx parameters. The ecore dcbx implementation provides three APIs, one for querying the dcbx parameters and the other two for updating the dcbx configuration.
743\silentfunc{dcbx_query_params}
744\silentfunc{dcbx_get_config_params}
745\silentfunc{dcbx_config_params}
746\begin{itemize}
747	\item \myfunc{dcbx\_query\_params}{dcbx\_query\_params} – The API returns the current dcbx configuration. It expects type (i.e., local/remote/operational) and the buffer for storing the dcbx parameters of that type.\\
748
749	\item \myfunc{dcbx\_get\_config\_params}{dcbx\_get\_config\_params} - The API returns the currently cached dcbx parameter set that can be modified for making the dcbx update requests. \\
750
751	\item \myfunc{dcbx\_config\_params}{dcbx\_config\_params} – The API is used for sending the dcbx parameters update request. The API expects dcbx parameters to be configured and the flag specifying whether the parameters need to be sent to hardware or just cached at the ecore. When driver sends dcbx config to the hardware, device initiates the dcbx negotiation with the peer using lldp protocol. The negotiation takes a few seconds to complete, and also the lldp requests are rate limited (using a predefined credit value). The dcbx API option “hw\_commit” specifies whether the dcbx parameters need to be committed to the hardware or just cached at the driver. When client requests the commit, all the cached parameters are sent to the device and the parameter negotiation will be initiated with the peer. \\
752\end{itemize}
753To configure the dcbx parameters, the upper layer driver invokes the ecore\_dcbx\_get\_config\_params() API to get the current config parameter set, updates the required parameters, and then invokes the ecore\_dcbx\_config\_params() API.
754
755If there is any change in the dcbx configuration at the host (for example due to a negotiation with the peer), then MFW notifies the same to ecore. OSAL\_DCBX\_AEN() would be called after such notification, ecore client would need to provide the implementation for this OSAL.
756
757\section{Management protocol APIs}
758\label{sec:mfw-protocols}
759MFW needs various bits of information from the driver, and it gathers those in one of two methods:
760\begin{itemize}
761	\item Pulling – if ecore can’t provide information on its own, ecore-client would be required to implement an OSAL.\\
762	\item Pushing – it’s the ecore and ecore-client’s responsibility to push the data.\\
763\end{itemize}
764In some cases, ‘Push’ is done without involvement of the ecore-client. If that’s not possible, it becomes more risky as the responsibility of doing things correctly passes to the ecore-client. Ecore-client shouldn’t presume to do ‘push’ only for calls which match the configured management mode. Instead it should always do them and let the ecore be the arbiter of whether those are needed by MFW or not. Ecore provides the following APIs for updating the configuration attributes, it is the client's responsibility to invoke these APIs at the appropriate time.
765\silentfunc{mcp_ov_update_current_config}
766\silentfunc{mcp_ov_update_mtu}
767\silentfunc{mcp_ov_update_mac}
768\silentfunc{mcp_ov_update_wol}
769\silentfunc{mcp_ov_update_driver_state}
770\silentfunc{mcp_update_fcoe_cvid}
771\silentfunc{mcp_update_fcoe_fabric_name}
772\begin{itemize}
773	\item \myfunc{mcp\_ov\_update\_current\_config}{mcp\_ov\_update\_current\_config} – Drivers need to call this API when user updates one (or more) of the following: mtu, primary mac or Wake on LAN settings (to a non-default value). In addition, it also needs to call a unique API per each:
774	\begin{itemize}
775		\item \myfunc{mcp\_ov\_update\_mtu}{mcp\_ov\_update\_mtu} – called when user sets the mtu to a value other than the default provided by the ecore.\\
776
777		\item \myfunc{mcp\_ov\_update\_mac}{mcp\_ov\_update\_mac} – called when user updates the primary mac address.\\
778
779		\item \myfunc{mcp\_ov\_update\_wol}{mcp\_ov\_update\_wol} – called when Wake-on-LAN settings are updated.\\
780	\end{itemize}
781	\item \myfunc{mcp\_ov\_update\_driver\_state}{mcp\_ov\_update\_driver\_state} – notify about a change in the driver state. Following are the possible driver states,
782	\begin{itemize}
784
785		\item ECORE\_OV\_DRIVER\_STATE\_DISABLED - Driver is not ready yet.\\
786
787		\item ECORE\_OV\_DRIVER\_STATE\_ACTIVE - Driver is operational.\\
788	\end{itemize}
789	Ecore sets the following driver states,
790	\begin{itemize}
791		\item DISABLED - After firmware is successfully loaded on the device, ecore updates the driver state as DISABLED (as part of ecore\_hw\_init() implementation). \\
792		\item NOT\_LOADED - Ecore sets this state when the protocol driver is unloaded (as part of ecore\_hw\_remove()).\\
793	\end{itemize}
794	It's the protocol driver's responsibility to alternate between the states,
795	\begin{itemize}
796		\item ACTIVE - Set when the required initialization is done from the driver side and the device is ready for traffic switching.\\
797		\item DISABLED - Set when device is not operational (e.g., fastpath queues are released or not configured).\\
798	\end{itemize}
799	\item \myfunc{mcp\_update\_fcoe\_cvid}{mcp_update_fcoe_cvid} - Update MFW with the 802.1q fcoe vlan id assigned for the PF.\\
800	\item \myfunc{mcp\_update\_fcoe\_fabric\_name}{mcp_update_fcoe_fabric_name} - Update fabric name value to the MFW. Fabric name is the value returned by the fabric domain controller in response to a GS-FC “Get Fabric Name” command from the adapter.\\
801\end{itemize}
802
803Ecore also provides the TLV request interface for MFW for querying the driver/device attributes. MFW uses mailbox interface to notify ecore on the required TLV information. Ecore parses the request, populates the required information with the help of ecore clients and sends it to the MFW. Ecore client needs to provide the necessary infrastructure and the OSALs for implementing this interface.
804\begin{itemize}
805	\item OSAL\_MFW\_TLV\_REQ - The call indicates that ecore has received a TLV request notification from the MFW. The execution context is interrupt mode, hence the ecore client needs to schedule a thread/bottom-half context to handle this task, and return control immediately. The bottom-half thread will need to invoke \myfunc{mfw\_process\_tlv\_req}{mfw_process_tlv_req} for further processing of the TLV request.\\
806	\item OSAL\_MFW\_FILL\_TLV\_DATA - Ecore invokes this callback to get the TLV values of a given type. Ecore client needs to fill in the values for all the fields that it's aware of, and also needs to set the flags associated with the respective fields. For instance,  if client sets value for 'npiv\_enabled' field, it needs to set the flag 'npiv\_enabled\_set' to true.\\
807\end{itemize}
808
809\SpillChapterFuncs
810
811
812\chapterimage{qlogic-full-36}
813\chapter{L2 protocol}
814\label{cha:l2}
815
816\section{L2-related terminology}
817This section describes in a very high-level manner several FW objects which are related to L2. Developers implementing L2 support over the ecore should be familiar with these\footnote{Probably even more than is in the scope of this document.}.
818\begin{itemize}
819	\item Virtual port [\myindex{VPORT}] -- Can simply be seen as a collection of queues, each HW-function will have at least one VPORT configured\footnote{And in most scenarios one will suffice.}. Classifications are configured per-VPORT. \\
820
821	\item Queues -- Either Rx/Tx, queues are attached to a VPORT. There can be multiple queues per-VPORT [e.g., if RSS/TSS is supported]. Usually, each Rx queue will use its own status block for interrupts upon Rx packets but Tx queues can utilize the same status blocks, using different protocol indices. \\
822\end{itemize}
823
824\section{Starting an L2 device}
825\label{sec:l2-start}
826This section begins after section \ref{sec:init-init}, I.e., assuming the HW-function has already been initialized by the init tool and the PF\_START ramrod has already been sent.
827
828	\begin{NOTICE}
829	Although VPORTs' and queues' indices are shared between all HW-function on the same engine, the resource allocation scheme determines a range of VPORTs per-HW-function to use for configuration [i.e., developer can assume starting index is always 0 per-HW-function].
830	\end{NOTICE}
831
832
833\silentfunc{sp_vport_start}
834\silentfunc{eth_rx_queue_start}
835\silentfunc{eth_tx_queue_start}
836\silentfunc{sp_vport_update}
837\begin{enumerate}
838	\item \myfunc{sp\_vport\_start}{sp_vport_start} -- this function initializes a vport in FW [ETH\_RAMROD\_VPORT\_START will be sent]. The handle for this function is a \texttt{vport\_id} which is passed and the most 'interesting' argument is the MTU for that VPORT.
839	This VPORT will be inactive after sending this ramrod, i.e., until enabling it via a vport update it will not actually perform Rx/Tx. \\
840
841	\item \myfunc{eth\_rx\_queue\_start}{eth_rx_queue_start} -- initializes an rx queue on a given VPORT.
842	A prerequisite is that the VPORT has already been initialized.
843	There are 2 identifiers of the queue - the queue index to add and the VPORT index to add it to. The queue-index should be unique for the Rx-queue; No 2 Rx-queues of the same PF should use the same id.
844	There are quite a few parameters that need to be supplied, e.g., status block, physical addresses of rings, etc.
845
846	The function is expected to receive a pointer to a \texttt{p\_ret\_params} which it will fill with outputs [upon success]. The ecore would fill the address where producer-updates need to be written [in the storm's RAM]; The upper-driver will write producer updates to that address to replenish its Rx-rings.
847
848	\begin{NOTICE}
849	Address is mapped by GTT, so upper-driver can simply write to that address, using the necessary memory barriers.
850	\end{NOTICE}
851	In addition, ecore would also fill a \texttt{p\_handle}. This handle is opaque to the ecore-client, and should be passed to other Rx-queue APIs when doing configuration relating to that queue.
852
853	After calling this function, upper-layer driver should initialize the Rx packets producers. \\
854
855	\item \myfunc{eth\_tx\_queue\_start}{eth_tx_queue_start} -- initializes a Tx queue on a given VPORT.
856		Very similar to the Rx queue start method, with some slight differences in the parameters [BD ring address instead of Rx rings, etc.]. For Tx-queues, the same queue-id can be shared between 2 different queues. That would cause those queues to share the same coalescing configuration.
857		Just like for Rx-queues, the ecore would fill the \texttt{p\_ret\_params} with an opaque handler to be used for further calls relating to this queue. In addition, it will provide a \texttt{p\_doorbell} address, which is an address into which a doorbell needs to be written to activate firmware once a packet is placed on this Tx queue and the buffer descriptors are filled.
858		\begin{NOTICE}
859		Doorbell addresses are on a different BAR than that of other memories/registers accessed by driver, and the PTT/GTT scheme does not apply to it; Thus the address can simply be accessed using the necessary memory barriers.
860		\end{NOTICE}
861
862	\item \myfunc{sp\_vport\_update}{sp_vport_update} -- This is required to enable the VPORT. It should be called after the Tx/Rx queues were already added, and this will enable the VPORT to send and receive packets\footnote{Notice that without classification configuration Rx won't actually work. Also notice this function can do a lot of things; Enabling the VPORT is only one of them.}.
863
864	In order to enable the VPORT for traffic, the upper-layer driver should set in \texttt{p\_params} the following:
865	\begin{enumerate}
866		\item \texttt{update\_vport\_active\_flg} to 1.
867		\item \texttt{vport\_active\_flg} to 1.
868	\end{enumerate}
869
870\end{enumerate}
871
872\section{Configuring Classifications}
873\label{sec:l2-class}
874Classification configuration consists [mostly] of three things:
875\begin{enumerate}
876	\item Configuration of the \myindex{Rx mode} -- This defines which datagrams [unicast, multicast, broadcast] should be accepted by the VPORT, and whether all such datagrams are accepted or only those for which a filter is configured.
877	\item Configuration of unicast / multicast filters -- defining filters for specific unicast / multicast addresses which should be matched, given that Rx mode  agrees.
878	\item Configuration of vlan filters -- by default, all vlans will be accepted. If at least one vlan [or vlan-mac] filter is configured, only traffic which matches one of the configured vlan filters will pass through.
879\end{enumerate}
880
881There are several ecore functions which are responsible for configuring classifications:
882
883\silentfunc{filter_accept_cmd}
884\silentfunc{sp_eth_filter_ucast}
885\silentfunc{filter_mcast_cmd}
886\begin{itemize}
887	\item \myfunc{filter\_accept\_cmd}{filter_accept_cmd} -- configures the Rx mode of the device.
888	\item \myfunc{sp\_vport\_update}{sp_vport_update} -- although not exactly a classification function, calling this will re-set the Rx mode [this calls \texttt{ecore\_filter\_accept\_cmd()} as part of its work].
889	\item \myfunc{sp\_eth\_filter\_ucast}{sp_eth_filter_ucast} -- configures either a unicast filter, vlan filter or a unicast/vlan filter pair.
890		An important parameter for the upper-layer driver\footnote{in the sense that it might affect design, since all fields are relevant.} is the `opcode' field:
891		\begin{itemize}
892			\item ECORE\_FILTER\_ADD -- adds a new filter.
893			\item ECORE\_FILTER\_REMOVE -- removes a filter.
894			\item ECORE\_FILTER\_MOVE -- removes a filter from one vport and adds it to another simultaneously\footnote{Needed by windows.}.
895			\item ECORE\_FILTER\_REPLACE -- adds a new filter after removing all previously configured filters.
896		\end{itemize}
897
898	\item \myfunc{filter\_mcast\_cmd}{filter_mcast_cmd} -- configures a multicast filter.
899	\begin{warning}
900		This function exists in the ecore but at the moment it's not implemented.
901	\end{warning}
902
903\end{itemize}
904These functions expose the \texttt{ecore\_spq} implementation -- upper-driver layer can choose whether to wait for completion, supply a callback for completion or do-nothing upon completion (the last will usually be the chosen path).
905
906\section{Stopping an L2 device}
907This is pretty straightforward, and works in reverse-order to the initialization of the L2 device.
908After upper-layer driver guarantees that no new Tx-packets will be generated and once Tx queues are all empty, it should do the following:
909\silentfunc{eth_tx_queue_stop}
910\silentfunc{eth_rx_queue_stop}
911\silentfunc{sp_vport_stop}
912\begin{enumerate}
913	\item Disable the VPORT by calling \texttt{ecore\_sp\_vport\_update()} after setting:
914	\begin{enumerate}
915		\item \texttt{update\_vport\_active\_flg} to 1.
916		\item \texttt{vport\_active\_flg} to 0.
917	\end{enumerate}
918
919	\item Close all Tx queues\footnote{Actually, order does not matter between Tx and Rx queues}  by calling \myfunc{eth\_tx\_queue\_stop}{eth_tx_queue_stop}.
920
921	\item Close all Rx queues by calling \myfunc{eth\_rx\_queue\_stop}{eth_rx_queue_stop}.
922
923	\item Close the vport by calling \myfunc{sp\_vport\_stop}{sp_vport_stop}.
924\end{enumerate}
925
926Following the completion of the \texttt{vport\_stop}, no further traffic should be flowing. Interrupts can be released, and resources can be freed.
927Notice this on its own doesn't return the device into a `clean-slate' state; there are still several non-L2 things that need to be done [e.g., cleaning the status blocks of the queues].
928\SpillChapterFuncs
929
930\chapterimage{pictures/qlogic-full-36.jpg}
931\chapter{100G support}
932\label{cha:100}
933
934Our device supports \myindex{100G} link. However, the fastpath pipeline of each HW engine isn't fast enough for that line-rate, so in 100G mode both HW engines are used together. The term \emph{HW-function} is a catch-all for the HW resources and identifications normally required by a single PCI function. In 100G mode, the device will enumerate as a single PCI function\footnote{Or more in multi-function mode; But we will stick with single-function mode for simplicity here.}, but the driver running over this PCI function will utilize multiple HW-functions.
935From pci standpoint, the distinction between the HW functions (and thus the HW engines) is done via the bar address. Access to the first half of each of the pci function's bars will be translated into an access into a HW function on the first engine, while access to the second half will be translated into an access into a HW function on the second engine.
936From the wire standpoint, both HW-functions are connected to a single physical port, i.e., transmitting traffic from either HW-function will lead to transmission on the same physical port. Incoming traffic from the port is routed to a hardware engine according to its protocol 4-tuple. The HW block responsible for this routing is the \myindex{OPTE}.
937
938This special configuration is also sometimes referred to as \myindex{Couple Mode Teaming} or \myindex{CMT}.
939
940After the early initialization phase of the ecore (i.e., following ecore\_hw\_prepare()), the \textit{ecore\_dev} field \myindex{num\_hwfns} will be filled with the correct number of HW-functions under the PCI device. The ecore and its client should access only the first num\_hwfns entries in the \textit{hwfns} array.
941
942\section{Effects on MSI-X interrupts}
943\label{sec:100int}
944Each path has its own IGU CAM, meaning it has its own set of available status blocks. But as both HW-functions share the same PCI function, there is a single MSI-X table for that device.
945As a result, when in CMT the MSI-X vectors are split between the two HW-functions.
946
947\begin{exampleT}
948\label{ex:CMT1}
949Assume a PCI function is in CMT mode. Let $\text{hwfn}_0$ stand for its HW-function under the first engine and $\text{hwfn}_1$ stand for its HW-function under the second engine.
950Let $\text{MSIX}_i$ stand for the $i^{th}$ entry in the PCI function's MSI-X table.
951
952Then for every $n \in \mathbb{N}_{0}$, $\text{MSIX}_{2n}$ is connected to $\text{hwfn}_0$'s status block of index $n$, and $\text{MSIX}_{2n+1}$ is connected to $\text{hwfn}_1$'s status block of index $n$.
953\end{exampleT}
954
955\section{Effects on device slowpath configuration}
956Ecore handles almost all the difference between CMT and regular mode on its own, i.e., it reads the number of HW-functions under the device and iterates when needed to configure both engines correctly (whereas in the non-CMT mode it would have simply configured one).
957What it does require from its client is:
958\begin{itemize}
959	\item Implement OSAL\_BAR\_SIZE. Ecore uses it to determine where it needs to split the bars; Without it it's very likely things will fail very early during initialization.
960
961	\item Set the HW-function's pf\_params for each HW-function before calling \textit{ecore\_resc\_alloc}.
962
963	\item Enable slowpath interrupts -- the first 2 MSI-X vectors should be used for slowpath. Notice that the ecore itself will call OSAL\_DPC\_INIT for each HW-function.
964	\begin{exampleT}
965		Following Example [\ref{ex:CMT1}], $\text{MSIX}_0$ should be enabled and connected to the DPC of $\text{hwfn}_0$ and $\text{MSIX}_1$ should be enabled and connected to the DPC of $\text{hwfn}_1$.
966	\end{exampleT}
967\end{itemize}
968
969When disabling the slowpath, it's important to remember that there were 2 different DPCs allocated and 2 MSI-X vectors configured to support them, as it's the ecore-client's responsibility to disable the interrupts.
970
971\section{Effects on L2 fastpath configuration}
972Since each HW-function is running on a different path and is an independent entity (as perceived by FW/HW), configuration should be almost symmetric for both HW-functions. E.g., following the flow of section \ref{sec:l2-start}, ecore\_sp\_vport\_start() should be called separately for each HW-function, queues should be opened separately for each, etc.
973
974Notice that in most cases you can even use the same indices, since FW-indices are per-path. E.g., you can use $\text{vport}_0$ on both HW-functions, since they are different on each path.
975
976\begin{warning}
977	When allocating the status blocks for your queues, do recall that the MSI-X table is shared between the engines, as explained in section [\ref{sec:100int}].
978\end{warning}
979
980\begin{NOTICE}
981	There is an issue between the user control of the number of queues and the actual configuration of queues - e.g., assume user wants $X$ queues. If we use a symmetric configuration what we actually do is open $X$ queues on each path, meaning we actually open $2X$ queues.
982
983	We can either only open $X/2$ queues on each engine, in which case we lose some abilities, e.g., control the keys of the RSS hash-function, or open $2X$ queues and try to hide this fact from user, but this most likely will either incur a performance penalty, hard-to-maintain code or both.
984\end{NOTICE}
985
986\chapterimage{qlogic-full-36}
987\chapter{iSCSI protocol}
988\label{cha:iscsi}
989
990This chapter describes the ecore interface for the upper-layer driver of the iSCSI protocol.
991
992\section{Start iSCSI PF}
993\silentfunc{sp_iscsi_func_start}
994\silentfunc{iscsi_get_global_cmdq_cons}
995\begin{itemize}
996	\item The basic initialization process is described in section \ref{sec:init-init} for all protocols. \\
997	Specifically for iSCSI, before calling \texttt{ecore\_resc\_alloc()}, the upper driver should determine the PF-global parameters, allocate all PF-global queues, and fill the \texttt{iscsi\_pf\_params} part in struct \texttt{ecore\_pf\_params}. \\
998	The following table describes the parameters that should be filled (the rest should be zero):
999	\begin{center}
1000		\begin{tabular}{| l | p{10cm} |}
1001		\hline
1002		\textbf{Parameter} & \textbf{Description} \\ \hline
1003		\texttt{num\_cons} & Up to 4K are supported, suggested default value 128 \\ \hline
1004		\texttt{num\_tasks} & Up to 4K are supported, suggested default value 1K \\ \hline
1005		\texttt{half\_way\_close\_timeout} & Timeout from sending FIN until abortive close, suggested default value 10sec \\ \hline
1006		\texttt{num\_sq\_pages\_in\_ring} & Number of outstanding tasks on the connection * 8B / page-size. \newline Suggested default value for number of outstanding tasks on the connection 256 \\ \hline
1007		\texttt{num\_r2tq\_pages\_in\_ring} & Same as \texttt{num\_sq\_pages\_in\_ring} \\ \hline
1008		\texttt{num\_uhq\_pages\_in\_ring} & Number of outstanding un-ACKed PDUs, suggested default value -- same as \texttt{num\_sq\_pages\_in\_ring} \\ \hline
1009		\texttt{num\_queues} & Number of global queues (CQ / CmdQ / RQ). \newline This should be $\leq$ number of available MSIX vectors for the PF \\ \hline
1010		\texttt{log\_page\_size} & 12 for 4KB pages \\ \hline
1011		\texttt{glbl\_q\_params\_addr} & The physical address of the list of pointers to the arrays of pointers to global queues pages. \newline The list is built as follows: CQ\#0 PBL pointer, RQ\#0 PBL pointer, CmdQ\#0 PBL pointer, CQ\#1 PBL pointer, RQ\#1 PBL pointer, CmdQ\#1 PBL pointer, etc. \newline Each PBL pointer points to the physical address which contains an array of pointers to the physical addresses of the specific queue pages. \\ \hline
1012		\texttt{rqe\_log\_size} & 8 for 256B RQE \\ \hline
1013		\texttt{rq\_num\_entries} & Number of RQ entries, suggested value for Initiator 16 (4KB RQ), for Target 128 \\ \hline
1014		\texttt{cq\_num\_entries} & \texttt{num\_tasks} + \texttt{rq\_num\_entries} \\ \hline
1015		\texttt{cmdq\_num\_entries} & Number of CmdQ entries, suggested default value \texttt{num\_tasks} \\ \hline
1016		\texttt{max\_cwnd} & Max congestion window, suggested default value 0xFFFFFFFF (no limit) \\ \hline
1017		\texttt{dup\_ack\_threshold} & Dup-ACK counter, suggested default value 3 \\ \hline
1018		\texttt{max\_fin\_rt} & Number of FIN retransmits before abortive close, suggested default value 3 \\ \hline
1019		\texttt{gl\_rq\_pi} & The index in the status-block for CQ completions, suggested value 0 \\ \hline
1020		\texttt{gl\_cmd\_pi} & The index in the status-block for CmdQ completions, suggested value 1 \\ \hline
1021		\end{tabular}
1022	\end{center}
1023	\item After the basic initialization process is completed successfully, it is possible to establish the LL2 queue, and send / receive LL2 packets (as described in chapter \ref{cha:ll2}).
1024	\item \myfunc{sp\_iscsi\_func\_start}{sp_iscsi_func_start} -- this function initializes the iSCSI PF, and passes PF-global parameters to FW. This function should be called before offloading any iSCSI connection.
1025	\item \myfunc{iscsi\_get\_global\_cmdq\_cons}{iscsi_get_global_cmdq_cons} -- this function returns the address in the device for updating CmdQ CONS for the specified queue.
1026\end{itemize}
1027
1028\section{Establish iSCSI connection}
1029\silentfunc{iscsi_acquire_connection}
1030\silentfunc{iscsi_offload_connection}
1031\silentfunc{iscsi_get_db_addr}
1032\silentfunc{iscsi_update_connection}
1033\begin{itemize}
1034	\item \myfunc{iscsi\_acquire\_connection}{iscsi_acquire_connection} -- this function allocates the resources for the connection. \texttt{p\_in\_conn} which is passed to this function should be NULL. Note that ecore itself allocates struct \texttt{ecore\_iscsi\_conn}, and returns its pointer to the upper driver via \texttt{p\_out\_conn}. Amongst others, ecore initializes in this struct the \texttt{icid} to be used in later task initialization, and the \texttt{conn\_id}, which is a zero-based index.
1035	\item \myfunc{iscsi\_offload\_connection}{iscsi_offload_connection} -- this function offloads the connection to the device, and requests to establish the TCP connection. Before calling this function, the upper driver should determine the connection TCP parameters, allocate the connection SQ, and fill parameters in \texttt{ecore\_iscsi\_conn} struct. \\
1036	The following table describes the parameters that should be filled:
1037	\begin{center}
1038		\begin{tabular}{| l | p{10cm} |}
1039		\hline
1040		\textbf{Parameter} & \textbf{Description} \\ \hline
1041		\texttt{tcp\_on\_chip\_1b} & 1 \\ \hline
1042		\texttt{sq\_pbl\_addr} & The physical address of the array of pointers to the physical addresses of the SQ pages \\ \hline
1043		\texttt{local\_mac} & Local MAC address \\ \hline
1044		\texttt{remote\_mac} & Remote MAC address \\ \hline
1045		\texttt{vlan\_id} & VLAN ID \\ \hline
1046		\texttt{flags} & TS\_EN (timestamp enable) -- suggested default value 1 \newline DA\_EN (delayed-ACK enable) -- suggested default value 1 \newline DA\_CNT\_EN (delayed-ACK counter enable) -- suggested default value 1 \newline SACK\_EN (SACK enable) -- NA (not supported for iSCSI) \newline KA\_EN (keep-alive enable) -- suggested default value 1 \newline NAGLE\_EN (nagle enable) -- NA (not supported for iSCSI) \newline FIN\_SENT -- should be 0 \newline FIN\_RECEIVED -- should be 0 \\ \hline
1047		\texttt{ip\_version} & IP version \\ \hline
1048		\texttt{remote\_ip} & Remote IP address \\ \hline
1049		\texttt{local\_ip} & Local IP address \\ \hline
1050		\texttt{ka\_max\_probe\_cnt} & Number of keep-alive probe retransmits before indicating connection error, suggested default value 10 \\ \hline
1051		\texttt{dup\_ack\_theshold} & Dup-ACK counter, suggested default value 3 \\ \hline
1052		\texttt{rcv\_next} & In passive-open, SYN sequence number + 1. NA in active open \\ \hline
1053		\texttt{rcv\_wnd} & The window to advertise to the peer (before the scaling) \\ \hline
1054		\texttt{snd\_wl1} & In passive-open, SYN sequence number. NA in active open \\ \hline
1055		\texttt{cwnd} & Initial congestion window, suggested default value MSS \\ \hline
1056		\texttt{ss\_thresh} & Slow-start threshold, suggested default value 65535 \\ \hline
1057		\texttt{srtt} & Smoothed round-trip time, suggested default value 300 (300msec) \\ \hline
1058		\texttt{rtt\_var} & Round-trip time variation, suggested default value 150 (150msec) \\ \hline
1059		\texttt{ts\_recent} & In passive-open, the timestamp value in the SYN packet. NA in active open \\ \hline
1060		\texttt{flow\_label} & Flow label for IPv6, NA for IPv4 \\ \hline
1061		\texttt{ka\_timeout} & Timeout before the next KA after receiving ACK, suggested default value 7200000 (2 hours) \\ \hline
1062		\texttt{ka\_interval} & Timeout before the next KA after sending KA probe, suggested default value 10000 (10 sec) \\ \hline
1063		\texttt{max\_rt\_time} & Maximum retransmit time before indicating connection error, suggested default value 20sec \\ \hline
1064		\texttt{ttl} & Time-to-live for IPv4, hop-limit for IPv6 \\ \hline
1065		\texttt{tos\_or\_tc} & Type-of-service for IPv4, traffic-class for IPv6 \\ \hline
1066		\texttt{remote\_port} & Remote TCP port \\ \hline
1067		\texttt{local\_port} & Local TCP port \\ \hline
1068		\texttt{mss} & Maximum segment size \\ \hline
1069		\texttt{snd\_wnd\_scale} & In passive-open, taken from the TS-scale option in the received SYN packet, NA for active open \\ \hline
1070		\texttt{rcv\_wnd\_scale} & Receive window scale, suggested default value 4 \\ \hline
1071		\texttt{ts\_ticks\_per\_second} & Time-stamp resolution, suggested default value 1000 (1msec) \\ \hline
1072		\texttt{da\_timeout\_value} & Delayed-ACK timeout, suggested default value 200 (msec) \\ \hline
1073		\texttt{ack\_frequency} & Delayed-ACK counter, suggested default value 2 \\ \hline
1074		\texttt{default\_cq} & The desired queue number for completing un-solicited packets / commands \\ \hline
1075		\end{tabular}
1076	\end{center}
1077	When this call completes, the connection is offloaded and 3-way handshake started. 3-way handshake completion is indicated by an asynchronous call from ecore.
1078	After this call completes (and even before the asynchronous call), driver can post Login PDU to SQ. However FW will process SQ only after 3-way handshake is completed.
1079	\item \myfunc{iscsi\_get\_db\_addr}{iscsi_get_db_addr} -- this function returns the address in the device for updating SQ PROD for the specified CID.
1080	\item \myfunc{iscsi\_update\_connection}{iscsi_update_connection} -- this function sends updated iSCSI connection parameters to the device, after Login negotiation ended successfully. Before calling this function, the upper driver should fill parameters in \texttt{ecore\_iscsi\_conn} struct. The following table describes the parameters that should be filled:
1081	\begin{center}
1082		\begin{tabular}{| l | p{10cm} |}
1083		\hline
1084		\textbf{Parameter} & \textbf{Description} \\ \hline
1085		\texttt{update\_flag} & The negotiated values for HeaderDigest, DataDigest, InitialR2T and ImmediateData \\ \hline
1086		\texttt{max\_seq\_size} & The negotiated value for MaxBurstLength \\ \hline
1087		\texttt{max\_pdu\_size} & The negotiated value for MaxRecvDataSegmentLength \\ \hline
1088		\texttt{first\_seq\_length} & The negotiated value for FirstBurstLength \\ \hline
1089		\texttt{exp\_stat\_sn} & For Initiator, StatSN from the Login response + 1. NA for Target \\ \hline
1090		\end{tabular}
1091	\end{center}
1092\end{itemize}
1093
1094\section{Close iSCSI connection}
1095\silentfunc{iscsi_terminate_connection}
1096\silentfunc{iscsi_release_connection}
1097\begin{itemize}
1098	\item \myfunc{iscsi\_terminate\_connection}{iscsi_terminate_connection} -- this function removes the connection from the device, and requests to close the TCP connection. When this call completes, the connection closure state machine has started, but the connection is still offloaded. Connection closure and removal from the device is indicated by an asynchronous call from ecore.
1099	Before calling this function, driver needs to clean all outstanding tasks on the connection by sending cleanup requests via SQ. Clear-SQ / drain may be needed in exceptional cases. \\
1100	Logout PDU, if desired, should be posted to SQ before calling this function.
1101	\item \myfunc{iscsi\_release\_connection}{iscsi_release_connection} -- this function releases the resources for the connection. It should be called only after the asynchronous call from ecore on connection termination is received.
1102\end{itemize}
1103
1104\section{Close iSCSI PF}
1105\silentfunc{sp_iscsi_func_stop}
1106\begin{itemize}
1107	\item \myfunc{sp\_iscsi\_func\_stop}{sp_iscsi_func_stop} -- this function closes the iSCSI PF. This function should be called only after all the connections on the PF were closed.
1108	\item If an LL2 queue was established, it should be closed before continuing with the de-initialization process.
1109	\item The rest of the de-initialization process is described in section \ref{sec:init-de-init} for all protocols.
1110\end{itemize}
1111
1112\section{Getting statistics}
1113\myfunc{iscsi\_get\_stats}{iscsi_get_stats} can be used to query the device for various protocol-related statistics.
1114
1115\SpillChapterFuncs
1116
1117\chapterimage{qlogic-full-36}
1118\chapter{FCoE protocol}
1119\label{cha:fcoe}
1120
1121This chapter describes the ecore interface for the upper-layer driver of the FCoE protocol.
1122
1123\section{Start FCoE PF}
1124\silentfunc{sp_fcoe_func_start}
1125\silentfunc{fcoe_get_global_cmdq_cons}
1126\begin{itemize}
1127	\item The initialization process is described in section \ref{sec:init-init} for all protocols. \\
1128	Specifically for FCoE, before calling \texttt{ecore\_resc\_alloc()}, the upper driver should determine the PF-global parameters, allocate all PF-global queues, and fill the \texttt{fcoe\_pf\_params} part in struct \texttt{ecore\_pf\_params}. \\
1129	The following table describes the parameters that should be filled (the rest should be zero):
1130	\begin{center}
1131		\begin{tabular}{| l | p{10cm} |}
1132		\hline
1133		\textbf{Parameter} & \textbf{Description} \\ \hline
1134		\texttt{num\_cons} & Up to 4K are supported, suggested default value 128 \\ \hline
1135		\texttt{num\_tasks} & Up to 4K are supported, suggested default value 1K \\ \hline
1136		\texttt{glbl\_q\_params\_addr} & The physical address of the list of pointers to the arrays of pointers to global queues pages. \newline The list is built as follows: CQ\#0 PBL pointer, RQ\#0 PBL pointer, CmdQ\#0 PBL pointer, CQ\#1 PBL pointer, RQ\#1 PBL pointer, CmdQ\#1 PBL pointer, etc. \newline Each PBL pointer points to the physical address which contains an array of pointers to the physical addresses of the specific queue pages. \\ \hline
1137		\texttt{sq\_num\_pbl\_pages} & Number of outstanding tasks on the connection * 8B / page-size. \newline Suggested default value for number of outstanding tasks on the connection 256 \\ \hline
1138		\texttt{rq\_num\_entries} & Number of RQ entries, suggested value for Initiator 16 (4KB RQ), for Target 128 \\ \hline
1139		\texttt{cq\_num\_entries} & \texttt{num\_tasks} + \texttt{rq\_num\_entries} \\ \hline
1140		\texttt{cmdq\_num\_entries} & Number of CmdQ entries, suggested value \texttt{num\_tasks} \\ \hline
1141		\texttt{rq\_buffer\_log\_size} & 8 for 256B RQE \\ \hline
1142		\texttt{num\_cqs} & Number of global queues (CQ / CmdQ / RQ). This should be $\leq$ number of available MSIX vectors for the PF \\ \hline
1143		\texttt{log\_page\_size} & 12 for 4KB pages \\ \hline
1144		\texttt{mtu} & Ethernet maximum transmission unit \\ \hline
1145		\texttt{gl\_rq\_pi} & The index in the status-block for CQ completions, suggested value 0 \\ \hline
1146		\texttt{gl\_cmd\_pi} & The index in the status-block for CmdQ completions, suggested value 1 \\ \hline
1147		\end{tabular}
1148	\end{center}
1149	\item After the basic initialization process is completed successfully, it is possible to establish the LL2 queue, and send / receive LL2 packets.
1150	\item \myfunc{sp\_fcoe\_func\_start}{sp_fcoe_func_start} -- this function initializes the FCoE PF, and passes PF-global parameters to FW. This function should be called before offloading any FCoE connection.
1151	\item \myfunc{fcoe\_get\_global\_cmdq\_cons}{fcoe_get_global_cmdq_cons} -- this function returns the address in the device for updating CmdQ CONS for the specified queue.
1152\end{itemize}
1153
1154\section{Establish FCoE connection}
1155\silentfunc{fcoe_acquire_connection}
1156\silentfunc{fcoe_offload_connection}
1157\silentfunc{fcoe_get_db_addr}
1158\begin{itemize}
1159	\item \myfunc{fcoe\_acquire\_connection}{fcoe_acquire_connection} -- this function allocates the resources for the connection. \texttt{p\_in\_conn} which is passed to this function should be NULL. Note that ecore itself allocates struct \texttt{ecore\_fcoe\_conn}, and returns its pointer to the upper driver via \texttt{p\_out\_conn}. Amongst others, ecore initializes in this struct the \texttt{icid} to be used in later task initialization, and the \texttt{conn\_id}, which is a zero-based index.
1160	\item \myfunc{fcoe\_offload\_connection}{fcoe_offload_connection} -- this function offloads the connection to the device. Before calling this function, the upper driver should allocate the connection SQ, and fill parameters in \texttt{ecore\_fcoe\_conn} struct. \\
1161	The following table describes the parameters that should be filled:
1162	\begin{center}
1163		\begin{tabular}{| l | p{10cm} |}
1164		\hline
1165		\textbf{Parameter} & \textbf{Description} \\ \hline
1166		\texttt{sq\_pbl\_addr} & The physical address of the array of pointers to the physical addresses of the SQ pages \\ \hline
1167		\texttt{sq\_curr\_page\_addr} & The physical address of the first SQ page \\ \hline
1168		\texttt{sq\_next\_page\_addr} & The physical address of the second SQ page (or the first in case of a single page SQ) \\ \hline
1169		\texttt{dst\_mac\_addr\_lo} & Remote MAC address -- 2 LSB bytes \\ \hline
1170		\texttt{dst\_mac\_addr\_mid} & Remote MAC address -- 2 middle bytes \\ \hline
1171		\texttt{dst\_mac\_addr\_hi} & Remote MAC address -- 2 MSB bytes \\ \hline
1172		\texttt{src\_mac\_addr\_lo} & Local MAC address -- 2 LSB bytes \\ \hline
1173		\texttt{src\_mac\_addr\_mid} & Local MAC address -- 2 middle bytes \\ \hline
1174		\texttt{src\_mac\_addr\_hi} & Local MAC address -- 2 MSB bytes \\ \hline
1175		\texttt{tx\_max\_fc\_pay\_len} & The maximum FC payload size to transmit \\ \hline
1176		\texttt{e\_d\_tov\_timer\_val} & E\_D\_TOV timeout value in 1msec resolution \\ \hline
1177		\texttt{rec\_tov\_timer\_val} & REC\_TOV timeout value in 1msec resolution \\ \hline
1178		\texttt{rx\_max\_fc\_pay\_len} & The maximum FC payload size to receive \\ \hline
1179		\texttt{vlan\_tag} & VLAN ID \\ \hline
1180		\texttt{s\_id} & FC Source ID \\ \hline
1181		\texttt{max\_conc\_seqs\_c3} & Max concurrent sequences \\ \hline
1182		\texttt{d\_id} & FC destination ID \\ \hline
1183		\texttt{flags} & INCR\_SEQ\_CNT (continuously increasing SEQ\_CNT on receive) \newline CONF\_REQ (confirmation request supported) \newline REC\_VALID (REC timeout supported) \newline VLAN\_FLAG (indicates if the VLAN ID is valid) \\ \hline
1184		\texttt{def\_q\_idx} & The desired queue number for completing un-solicited packets / commands \\ \hline
1185		\end{tabular}
1186	\end{center}
1187	After this call completes driver can post tasks to SQ.
1188	\item \myfunc{fcoe\_get\_db\_addr}{fcoe_get_db_addr} -- this function returns the address in the device for updating SQ PROD for the specified CID.
1189\end{itemize}
1190
1191\section{Close FCoE connection}
1192\silentfunc{fcoe_terminate_connection}
1193\silentfunc{fcoe_release_connection}
1194\begin{itemize}
1195	\item \myfunc{fcoe\_terminate\_connection}{fcoe_terminate_connection} -- this function removes the connection from the device. \\
1196	Before calling this function, driver needs to clean all outstanding tasks on the connection by sending cleanup requests via SQ. Drain may be needed in exceptional cases.
1197	\item \myfunc{fcoe\_release\_connection}{fcoe_release_connection} -- this function releases the resources for the connection.
1198\end{itemize}
1199
1200\section{Close FCoE PF}
1201\silentfunc{sp_fcoe_func_stop}
1202\begin{itemize}
1203	\item \myfunc{sp\_fcoe\_func\_stop}{sp_fcoe_func_stop} -- this function closes the FCoE PF. This function should be called only after all the connections on the PF were closed.
1204	\item If an LL2 queue was established, it should be closed before continuing with the de-initialization process.
1205	\item The rest of the de-initialization process is described in section \ref{sec:init-de-init} for all protocols.
1206\end{itemize}
1207
1208\section{Getting statistics}
1209\myfunc{fcoe\_get\_stats}{fcoe_get_stats} can be used to query the device for various protocol-related statistics.
1210
1211\SpillChapterFuncs
1212
1213\chapterimage{qlogic-full-36}
1214\chapter{RDMA protocol}
1215\label{cha:rdma}
1216
1217This chapter describes the ecore interface for the upper-layer driver of the RDMA protocol. The interface aims at sharing as much as possible between RoCE and iWARP. This chapter is not complete, and currently only details changes for iWARP (except for DCQCN, which was already detailed before).
1218For iWARP support, existing structure and function names will be modified to ease distinction between the two protocols, similar to the HSI changes. The following convention will be used:
1219\begin{itemize}
1220\item ecore\_rdma\_xxx will be used for common structures and functions
1221\item ecore\_roce\_xxx will be used for roce specific structures, fields and functions
1222\item ecore\_iwarp\_xxx will be used for iwarp specific structures, fields and functions
1223\end{itemize}
1224
1225\section{Distinguish between iWARP and RoCE}
1226Ecore per function context has a field ``personality'' which is set based on the protocol being iWARP/FCoE/iSCSI/RoCE/Ethernet. In the context of a network driver, the ecore personality could be ECORE\_PCI\_ETH\_ROCE, ECORE\_PCI\_IWARP, or ECORE\_PCI\_ETH (see enum ecore\_pci\_personality).
1227Ecore provides the driver the ability to set the ecore personality through the call to ecore\_hw\_prepare by passing personality as a parameter. If the `personality' passed in the call to ecore\_hw\_prepare is ECORE\_PCI\_DEFAULT, the `personality' is derived from the NVRAM configuration for protocol and device capability; otherwise the setting passed by the upper driver in the call overrides the NVRAM configuration.
1228TBD: NVRAM configuration for distinguishing iWARP and RoCE does not exist and is not finalized yet.
1229
1230
1231\section{Init RDMA PF}
1232\silentfunc{rdma_start}
1234\silentfunc{rdma_query_device}
1235\silentfunc{rdma_get_sb_id}
1236\begin{itemize}
1237\item The initialization process is described in section \ref{sec:init-init} for all protocols. \\
1238	Specifically for RDMA, before calling \texttt{ecore\_resc\_alloc()}, the upper driver should determine the PF-global parameters, allocate all PF-global queues, and fill the \texttt{rdma\_pf\_params} part in struct \texttt{ecore\_pf\_params}. \\
1239	The following table describes the parameters that should be filled (the rest should be zero):
1240	\begin{center}
1241		\begin{tabular}{| l | p{10cm} |}
1242		\hline
1243		\textbf{Parameter} & \textbf{Description} \\ \hline
1244		\texttt{min\_dpis} & the minimum number of device contexts required (i.e. the number of times open device can be called simultaneously) \\ \hline
1245		\texttt{num\_qps} & requested number of queue pairs\\ \hline
1246		\texttt{edpm\_mode} & (0-Enable EDPM if BAR size is adequate, 1-Force EDPM (modprobe may fail on small BARs), 2-Disable EDPM) This field is relevant to RoCE only\\ \hline
1247		\texttt{num\_mrs} & Number of supported MRs. Up to 4K are supported, suggested default value 1K \\ \hline
1248		\texttt{gl\_pi} & The index in the status-block for CNQ completions, suggested value 3 (define named QED\_RDMA\_PROTOCOL\_INDEX exists)\\ \hline
1249		\texttt{roce\_enable\_dcqcn} & If enabled, the maximum number of rate limiters will be allocated during hardware initialization, which can later be initialized and configured during roce start. Must be set in order to enable dcqcn during roce initialization. This field is relevant to RoCE only.\\ \hline
1250		\texttt{iwarp\_port} & TCP port number to be used for the iwarp traffic\\ \hline
1251		\end{tabular}
1252	\end{center}
1253	The values of num\_qps, num\_mrs will impact the amount of memory allocated in the ILT. Note that although these parameters are rdma specific, they are actually used during common hw initialization phase. The amount of ilt memory will differ between RoCE and iWARP as iWARP requires only one cid per QP and RoCE requires two.
1254
1255	\item \myfunc{rdma\_start}{rdma_start} -- this function initializes the RDMA PF, allocates resources required for RDMA and passes PF-global parameters to FW. This function should be called before performing any other RDMA operations.
1256 The following table describes the parameters that should be passed to the function:
1257	\begin{center}
1258		\begin{tabular}{| l | p{10cm} |}
1259		\hline
1260		\textbf{Parameter} & \textbf{Description} \\ \hline
1261		\texttt{events} & RoCE - callback functions for affiliated and unaffiliated events.\\ \hline
1262		\texttt{desired\_cnq} & desired number of cnqs to be used. Upper layer driver needs to make sure enough resources are available for this number (number of msix vectors and cnq resources)\\ \hline
1263		\texttt{cnq\_pbl\_list} & Array of pbls used per cnq. The array should be initialized according to the value set in desired\_cnq\\ \hline
1264		\texttt{cq\_mode} & The CQ Mode affects the CQ doorbell transaction size. 64 or 32 bit machines should configure to 32 or 16 bits respectively.\\ \hline
1265		\texttt{roce\_dcqcn\_params} & relevant only if enable\_dcqcn was initialized to true in rdma\_pf\_params. Upper level driver needs to set appropriate fields. See dcqcn section below.\\ \hline
1266		\texttt{max\_mtu} & Required for iWARP ll2. Can give ethernet mtu.\\ \hline
1267		\texttt{mac\_addr} & Required for iWARP ll2. Should be primary mac used for RDMA.\\ \hline
1268		\texttt{iwarp\_flags} & TCP related flags that can be controlled by user.
1269		\begin{tabbing}
1270			TS\_EN: Timestamp enabled \\
1271			DA\_EN: Delayed ack enabled \\
1272		\end{tabbing} \\ \hline
1273		\texttt{iwarp\_crc\_needed} & Control whether CRC should be used. \\ \hline
1274		\texttt{iwarp\_rcv\_wnd\_size} & ecore will calculate the receive window scale from this. This number should be provided in bytes. There is a minimum of 64K, any number below this will result in the default window size being set which is 1MB \\ \hline
1275		\end{tabular}
1276	\end{center}
1277	\item \myfunc{rdma\_get\_sb\_id}{rdma_get_sb_id} -- this function returns the first status block id assigned for RDMA. This is required for initializing the RDMA status blocks using the function \textit{ecore\_int\_sb\_init()}.
1278	\item \myfunc{rdma\_add\_user}{rdma_add_user} -- this function allocates a dpi index for the client. During initialization, this function should be called to allocate a reserved dpi index for the kernel.
1279	\item \myfunc{rdma\_query\_device}{rdma_query_device} -- this function returns a struct of type ecore\_rdma\_device which contains the capabilities and set options for the given device.
1280\subsection{DCQCN}
1281DCQCN is only relevant for RoCE.
1282\item Enable\_dcqcn under rdma\_pf\_params allocates additional hardware resources (rate limiters ) which can later be used to enable DCQCN notification point and reaction point. This must be set prior to calling \texttt{ecore\_resc\_alloc()}.
1283\item Additional parameters were added to ecore\_rdma\_start parameters to configure dcqcn. These will only be valid if the roce\_enable\_dcqcn in rdma\_pf\_params was set.
1284Notification point and reaction point can be enabled independently.
1285When configuring the device to act as notification point, the ecore will initialize the NIG block accordingly and pass the priority vlan and cnp send timeout values to FW. When configuring the device to act as reaction point, the ecore will send a ramrod to FW that configures the rate limiters allocated for dcqcn support with the values received from the upper layer driver ( such as maximum rate, byte counter limit, active increase rate etc... full detail in ecore\_roce\_api.h file ). At this point all rate limiters will be configured with the same values. If in the future there will be a need to configure different rate limiters with different values an additional API function will be provided. During initialization, ecore will map between physical queues used for RoCE and rate limiters. The number of rate limiters allocated is handled by resource management and is currently divided equally between the functions. During modify\_qp, ecore will configure the responder and requester to work with a unique physical queue, which is configured to work with a unique rate limiter. QPs that are opened after the rate limiters are exhausted will be configured to run on a default physical queue which does not have a rate limiter. FW assumes that the qp\_id is equal to the physical queue id. For simplicity, the implementation assumes that Ethernet is not run simultaneously with RoCE (i.e. Roce only personality). If dcqcn is enabled and ethernet is run, ethernet will run on the same physical queue as the first qp that is allocated.
1286\end{itemize}
1287
1288\section{iWARP Connection Establishment}
1289Unlike RoCE in which connection management is implemented completely in host, connection management for iWARP which involves the TCP 3 way handshake and MPA exchanges is implemented in  F/W. The host is nevertheless involved in offloading TCP and MPA and exchanging connection parameters as part of the connection establishment/teardown process.
1290\subsection{Ecore-upper driver connection establishment/teardown API for iWARP}
1291During connection establishment/teardown, the driver calls ecore connection related APIs and receives callbacks from ecore for connection related events. The driver registers its event callbacks by passing them as parameters to the different connection ecore APIs.
1292
1293\subsection{Ecore APIs/functions for driver (downcalls)}
1294\begin{tabular}{| l | p{10cm} |}
1295	\hline
1296	\textbf{Ecore Functions} & \textbf{Description} \\ \hline
1297	\texttt{ecore\_iwarp\_connect} & Used during active connection establishment. Called to establish an iWARP connection with a peer. This is a non-blocking call. Once connection is established an async event will be sent to driver. \\ \hline
1298	\texttt{ecore\_iwarp\_create\_listen} & Used for passive connection. Called to start a listener. \\ \hline
1299	\texttt{ecore\_iwarp\_destroy\_listen} & Used for passive connection. Called to destroy a listener. \\ \hline
1300	\texttt{ecore\_iwarp\_accept} & Used during passive connection establishment. Called for accepting a previously received iWARP connection request event. i.e. MPA request event. Once connection is fully established an async event will be sent to driver. \\ \hline
1301
1302\end{tabular}
1303\subsection{Communication Management information}
1304\label{sec:cminfo}
1305For both passive and active connect, basic information on host and peer is required. We define a structure called \texttt{ecore\_iwarp\_cm\_info} which will be passed between driver and ecore on both downcalls and upcalls. Throughout the rest of the chapter we'll refer to this as the cm\_info.
1306\begin{tabular}{| l | p{10cm} |}
1307 	\hline
1308 	\textbf{Field} & \textbf{Description} \\ \hline
1309 	\texttt{ip\_version} & Type: enum ecore\_tcp\_ip\_version. Determines if ipv6 or ipv4 \\ \hline
1310 	\texttt{remote\_ip} & Ip address of remote side. In host byte order. \\ \hline
1311 	\texttt{local\_ip} & Ip address of local side. In host byte order. \\ \hline
1312 	\texttt{remote\_port} & Port of the remote side. In host byte order. \\ \hline
1313 	\texttt{local\_port} & Port of the local side. In host byte order. \\ \hline
1314 	\texttt{vlan} & vlan to be used. 0 value means no vlan. \\ \hline
1315 	\texttt{private\_data} & Used for MPA. This data will be sent on the MPA request. \\ \hline
1316 	\texttt{private\_data\_len} & Length of the private data. \\ \hline
1317 	\texttt{ord} & Negotiated in MPA Rev2. Used as is in MPA Rev1. \\ \hline
1318 	\texttt{ird} & Negotiated in MPA Rev2. Used as is in MPA Rev1. \\ \hline
1319\end{tabular}
1320Ecore sends same parameter for all different event types, though the required parameters for an event type is a subset of the complete set of parameters. It does this so that it can pass a pointer to a data structure instead of reformatting the parameters in a different structure.
1321
1322\subsection{Active side connection establishment}
1323On the active side of iWARP connection establishment, it is assumed that create\_qp will be called prior to calling connect. The QP created will be sent as a parameter to the connect function call.
1324\subsubsection{ecore\_iwarp\_connect}
1325 This function will take care of initiating the TCP 3-way handshake and MPA negotiation. Once the MPA response is received the event EVENT\_ACTIVE\_COMPLETE will be issued to upper-layer driver. This function is asynchronous. The function will receive cm\_info (detailed in \ref{sec:cminfo} ), mss, local and remote mac address. The mac address will be acquired by upper-layer driver using OS ip routing functions (such as find\_route in linux). In addition, it will require a pointer to the associated QP and a pointer to a callback function and callback context which will be used to indicate events to the driver which are related to this connection. \newline
1326 \begin{tabular}{| l | p{10cm} |}
1327 	\hline
1328 	\textbf{Return Values} & \textbf{Description} \\ \hline
1329 	\texttt{ECORE\_NO\_MEM} & Memory is required for driver context of a connection. If it can't allocate it will return this failure. \\ \hline
1330 	\texttt{ECORE\_SUCCESS} & Means tcp offload was performed. Does not mean connection was established. The status of connection establishment will be passed with the EVENT\_ACTIVE\_COMPLETE. \\ \hline
1331\end{tabular}
1332\subsubsection{event callbacks related}
1333The callback received in connect call will be called with the following values after MPA response was received from peer: \newline
1334 \begin{tabular}{| l | p{10cm} |}
1335 	\hline
1336 	\textbf{Field} & \textbf{Value} \\ \hline
1337 	\texttt{event} & \texttt{ECORE\_IWARP\_EVENT\_ACTIVE\_COMPLETE} \\ \hline
1338 	\texttt{cm\_info} & \ref{sec:cminfo} with finalized values. MPA Rev2 for example will contain the negotiated ird/ord \\ \hline
1339 	\texttt{ep\_context} & Dont Care for active side \\ \hline
1340 	\texttt{status} & ECORE\_SUCCESS if connection establishment was successful. ECORE\_TIMEOUT if connection timed out, ECORE\_CONNECTION\_REFUSED if mpa\_reject was received, ECORE\_CONN\_RESET if connection establishment failed for any other reason. \\ \hline
1341 \end{tabular}
1342
1343\subsection{Passive side connection establishment}
1344The ecore will use the ll2 interface for implementing passive side connection establishment. Upper layer driver will send 2-tuples and vlan to ecore layer which the ecore should listen on. Once a SYN packet is received on the ll2 interface, the ecore will search its database to check if a listener was registered with the received 2-tuple and vlan. If one was registered, tcp offload ramrod will be sent and once the MPA request is received, the event EVENT\_MPA\_REQUEST will be issued to upper layer driver. At this stage it is expected that the upper layer driver will pass the MPA parameters such as private data, ord, ird all the way to the user app, which will in turn create a QP and related objects and later issue a call to ecore\_iwarp\_accept.
1345\subsubsection{iwarp\_create\_listen}
1346This function will receive socket local and remote addresses (port, ip and vlan) and add them to its listening database. In addition a callback function and callback context will be provided which will be used by ecore to send events of connection requests to the driver.
1347
1348\subsubsection{event callbacks related}
1349The callback received in listen call will be called with the following values after MPA request was received from network: \newline
1350\begin{tabular}{| l | p{10cm} |}
1351	\hline
1352	\textbf{Field} & \textbf{Value} \\ \hline
1353	\texttt{event} & \texttt{ECORE\_IWARP\_EVENT\_MPA\_REQUEST} \\ \hline
1354	\texttt{cm\_info} & \ref{sec:cminfo} with values received on the MPA request. \\ \hline
1355	\texttt{ep\_context} & Should be sent down to ecore during call to ecore\_iwarp\_accept. \\ \hline
1356	\texttt{status} & Dont Care for this event. \\ \hline
1357\end{tabular}
1358
1359\subsubsection{ecore\_iwarp\_accept}
1360This function should be called when upper layer driver wants to accept a request issued by ecore's EVENT\_MPA\_REQUEST. The QP should have been created before calling this function. This function will send the MPA ramrod which will send an MPA response. Once the ACK on the MPA response is received the event EVENT\_PASSIVE\_COMPLETE will be sent to upper layer driver.
1361\begin{center}
1362		\begin{tabular}{| l | p{10cm} |}
1363		\hline
1364		\textbf{Param} & \textbf{Description}\\ \hline
1365		\texttt{ep\_context} & The ep\_context received in the MPA\_REQUEST event\\ \hline
1366		\texttt{cb\_context} & The same callback function passed in create\_listen will be used but with this new cb\_context.\\ \hline
1367		\texttt{private\_data} & attach to MPA frame.\\ \hline
1368		\texttt{private\_data\_len} & length of private data.\\ \hline
1369		\texttt{ord} & to send on MPA response. \\ \hline
1370		\texttt{ird} & to send on MPA response. \\ \hline
1371		\texttt{qp} & QP associated with this connection.\\ \hline
1372		\end{tabular}
1373\end{center}
1374
1375\subsubsection{ecore\_iwarp\_reject}
1376This function should be called when upper layer driver / App wants to reject a connection request, indicated by ecore's EVENT\_MPA\_REQUEST, for whatever reason.
1377If a connection is rejected QP will not be associated with the connection request and remains an independent object ( if it was created ). Calling this function
1378will result in an MPA response being sent to peer with the 'reject' flag being turned on. EVENT\_PASSIVE\_COMPLETE will be sent to upper layer driver with status
1379code CONNECTION\_REFUSED.
1380\begin{center}
1381	\begin{tabular}{| l | p{10cm} |}
1382		\hline
1383		\textbf{Param} & \textbf{Description}\\ \hline
1384		\texttt{ep\_context} & The ep\_context received in the MPA\_REQUEST event\\ \hline
1385		\texttt{cb\_context} & The same callback function passed in create\_listen will be used but with this new cb\_context.\\ \hline
1386		\texttt{private\_data} & attach to MPA frame.\\ \hline
1387		\texttt{private\_data\_len} & length of private data.\\ \hline
1388	\end{tabular}
1389\end{center}
1390
1391\subsubsection{event callbacks related}
1392The callback received in listen call will be called with the following values after MPA response was acked by network/peer: \newline
1393\begin{tabular}{| l | p{10cm} |}
1394	\hline
1395	\textbf{Field} & \textbf{Value} \\ \hline
1396	\texttt{event} & \texttt{ECORE\_IWARP\_EVENT\_PASSIVE\_COMPLETE} \\ \hline
1397	\texttt{cm\_info} & \ref{sec:cminfo} with values negotiated. Dont care in case of mpa\_reject \\ \hline
1398	\texttt{ep\_context} & Dont care for this event. \\ \hline
1399	\texttt{status} & ECORE\_SUCCESS if connection establishment was successful. ECORE\_CONN\_RESET if connection establishment failed for any reason. ECORE\_CONNECTION\_REFUSED if mpa\_reject was called on the connection\\ \hline
1400\end{tabular}
1401
1402\subsubsection{ecore\_iwarp\_destroy\_listen}
1403This function will remove socket local and remote addresses (port, ip and vlan) from its listening database.
1404
1405
1406\subsection{Connection Teardown}
1407\label{sec:iwarp_teardown}
1408
1409\begin{figure}[h]
1410	\centering
1411	\includegraphics[scale=0.3]{iwarp_sm}
1412	\caption{iwarp state machine from Hilland verbs}
1413	\label{fig:iwarp_sm}
1414\end{figure}
1415
1416Connection teardown is performed via the modify\_qp verb according to the Hilland verbs state machine (see figure~\ref{fig:iwarp_sm}).
1417The interface into ecore is done with the states of RoCE and translated internally to iwarp states. This was done
1418to utilize the same interface for RoCE and iWARP. However, in the future this may be changed so that state translation
1419is done in the upper layer driver. Translation between the states is done as follows: \newline
1420\begin{tabular}{| l | p{10cm} |}
1421	\hline
1422	\textbf{RoCE State} & \textbf{iWARP State} \\ \hline
1423	ECORE\_ROCE\_QP\_STATE\_RESET & ECORE\_IWARP\_QP\_STATE\_IDLE \\ \hline
1424	ECORE\_ROCE\_QP\_STATE\_INIT & ECORE\_IWARP\_QP\_STATE\_IDLE \\ \hline
1425	ECORE\_ROCE\_QP\_STATE\_RTR & ECORE\_IWARP\_QP\_STATE\_IDLE \\ \hline
1426	ECORE\_ROCE\_QP\_STATE\_RTS & ECORE\_IWARP\_QP\_STATE\_RTS \\ \hline
1427	ECORE\_ROCE\_QP\_STATE\_SQD & ECORE\_IWARP\_QP\_STATE\_CLOSING \\ \hline
1428	ECORE\_ROCE\_QP\_STATE\_ERR & ECORE\_IWARP\_QP\_STATE\_ERROR \\ \hline
1429	ECORE\_ROCE\_QP\_STATE\_SQE & ECORE\_IWARP\_QP\_STATE\_TERMINATE \\ \hline
1430\end{tabular}
1431
1432\subsection{Active side connection Teardown}
1433\subsubsection{Graceful disconnect}
1434To initiate a graceful disconnect sequence, the active side will perform a modify\_qp to ECORE\_ROCE\_QP\_STATE\_SQD. This will be translated to ECORE\_IWARP\_QP\_STATE\_CLOSING and initiate a graceful teardown sequence with FW. Currently, due to existing FW implementation a modify qp to error will be sent to FW before closing the connection. In the future, FW HSI will be changed so that a CLOSING state is added to FW as well. Once the disconnect is complete, whether gracefully or abortively ( in some cases a graceful disconnect will turn into an abortive one, timeouts, errors in close etc... ) an ECORE\_IWARP\_EVENT\_CLOSE event will be sent to upper layer driver. Ecore will transition to ERROR state in any case at the end of the flow.
1435
1436\subsubsection{Abortive disconnect}
1437To initiate an abortive disconnect sequence, the active side will perform a modify\_qp to ECORE\_ROCE\_QP\_STATE\_ERR. This will be translated to ECORE\_IWARP\_QP\_STATE\_ERROR and initiate an abortive teardown sequence with FW. Once the disconnect is completed, an ECORE\_IWARP\_EVENT\_CLOSE event will be sent to upper layer driver. Ecore will transition to ERROR state in any case at the end of the flow.
1438
1439\subsubsection{event callbacks related}
1440The callback received in connect / accept call will be called with the following values after disconnect has completed: \newline
1441\begin{tabular}{| l | p{10cm} |}
1442	\hline
1443	\textbf{Field} & \textbf{Value} \\ \hline
1444	\texttt{event} & \texttt{ECORE\_IWARP\_EVENT\_CLOSE} \\ \hline
1445	\texttt{cm\_info} & Dont care. \\ \hline
1446	\texttt{ep\_context} & Dont care for this event. \\ \hline
1447	\texttt{status} & ECORE\_SUCCESS if connection was terminated gracefully (FIN). ECORE\_CONN\_RESET if connection was terminated abortively (RST) for any reason. \\ \hline
1448\end{tabular}
1449
1450\subsection{Passive side connection Teardown}
1451On passive side teardown sequence is initiated once a graceful / abortive request is received from peer. In this case ecore will send a ECORE\_IWARP\_EVENT\_DISCONNECT to upper layer driver. Ecore will transition to ERROR state in any case at the end of the flow.
1452
1453\subsubsection{event callbacks related}
1454The callback received in connect / accept call will be called with the following values once close request was received from peer: \newline
1455\begin{tabular}{| l | p{10cm} |}
1456	\hline
1457	\textbf{Field} & \textbf{Value} \\ \hline
1458	\texttt{event} & \texttt{ECORE\_IWARP\_EVENT\_DISCONNECT} \\ \hline
1459	\texttt{cm\_info} & Dont care. \\ \hline
1460	\texttt{ep\_context} & Dont care for this event. \\ \hline
1461	\texttt{status} & ECORE\_SUCCESS if graceful disconnect was received. ECORE\_CONN\_RESET if abortive disconnect was received. \\ \hline
1462\end{tabular}
1463\\
1464Ecore will continue the disconnect flow against FW without any additional requests from upper layer driver. Ecore will call upper layer driver with the following values after disconnect has completed: \newline
1465\begin{tabular}{| l | p{10cm} |}
1466	\hline
1467	\textbf{Field} & \textbf{Value} \\ \hline
1468	\texttt{event} & \texttt{ECORE\_IWARP\_EVENT\_CLOSE} \\ \hline
1469	\texttt{cm\_info} & Dont care. \\ \hline
1470	\texttt{ep\_context} & Dont care for this event. \\ \hline
1471	\texttt{status} & ECORE\_SUCCESS if connection was terminated gracefully (FIN). ECORE\_CONN\_RESET if connection was terminated abortively (RST) for any reason. \\ \hline
1472\end{tabular}
1473
1474\section{IB verb implementation}
1475
1476\silentfunc{rdma_alloc_pd}
1477\silentfunc{rdma_alloc_tid}
1478\silentfunc{rdma_create_qp}
1479
1480\begin{itemize}
1481	\item \myfunc{rdma\_alloc\_pd}{rdma_alloc_pd} -- This function allocates a unique protection domain id. The id is returned in the out parameter pd. (verb: Allocate Protection Domain).
1482	\item \myfunc{rdma\_alloc\_tid}{rdma_alloc_tid} -- this function allocates a unique tid (task id). The id is returned in the out parameter itid. This function also allocates required memory in the ilt array (Host memory used for hw purposes).
1483	\item \myfunc{rdma\_create\_qp}{rdma_create_qp} -- This function will create the qp object in ecore and for iWARP in FW. In RoCE no FW ramrods are sent during the call to this function. The main change from existing create\_qp function, for iWARP is that instead of providing addresses to rq, sq separately, and allocating memory for FW queues in ecore, FW requires contiguous memory for the pbl of all FW queues (RQ, SQ, ORQ, IRQ, HQ). Therefore interface will change, and instead of upper layer driver providing pbl address to create\_qp, these will be provided as out\_parameters after being allocated in ecore. Upper layer driver will be required to pass the number of pages required for SQ / RQ. Populating the pbls will be done after calling create\_qp and not before as done today. For ease of code sharing between iWARP and RoCE FW will modify RoCE implementation to work the same as iWARP.
1484	\item \myfunc{rdma\_modify\_qp}{rdma_modify_qp} -- The API will remain the same, however, for iWARP not all fields are relevant. Naming convention of RDMA/iWARP/RoCE was done in ecore\_roce\_api to distinguish between what is required and what is not. Modify QP is used in iWARP for part of the teardown flow detailed in \ref{sec:iwarp_teardown}
1485\end{itemize}
1486
1487\section{IWARP APP TLV configuration}
1488Ecore client has the ability to signal ecore that a specific tcp port in app tlv should be recognized as pertaining to the iwarp offloaded connections. If an app tlv which matches this port is indicated by MFW, all offloaded iwarp traffic of the PF will abide by this configuration (regardless of the actual tcp port of the offloaded connections). The app tlv can be set by the ecore client via the regular APIs for setting ``locally administered params''. Ecore client communicates the tcp port value via \texttt{rdma\_pf\_params} structure, the value needs to be populated before invoking  \myfunc{resc\_alloc}{resc_alloc}. To configure the iwarp app tlv in the locally administered dcbx parameters, ecore client needs to use the Dcbx APIs described in the ``Dcbx Interface'' section. The relevant APIs are \myfunc{dcbx\_get\_config\_params}{dcbx_get_config_params} and \myfunc{dcbx\_config\_params}{dcbx_config_params}.
1489
1490\SpillChapterFuncs
1491
1492\chapterimage{qlogic-full-36}
1493\chapter{LL2 (Light L2)}
1494\label{cha:ll2}
1495
1496This chapter describes the ecore interface for LL2 (Light L2). \\
1497The LL2 is a simplified version of L2 for which both slowpath and fastpath flows reside in ecore, and it is being used by the upper-layer drivers of the storage protocols.
1498
1499\section{Start LL2 connection}
1500\silentfunc{ll2_acquire_connection}
1501\silentfunc{ll2_establish_connection}
1502\begin{itemize}
1503	\item \myfunc{ll2\_acquire\_connection}{ll2_acquire_connection} -- this function allocates the resources for the LL2 connection. ll2\_acquire\_data structure that is received in this function contains the following parameters:
1504
1505	 \begin{tabular}{| l | p{10cm} |}
1506	 	\hline
1507	 	\textbf{Param} & \textbf{Description} \\ \hline
1508	 	\texttt{conn\_type} & should be set according to the protocol. \\ \hline
1509	 	\texttt{mtu} & Maximum bytes that can be placed on a BD. \\ \hline
1510	 	\texttt{rx\_num\_desc} & maximal number of entries in the Rx ring. \\ \hline
1511	 	\texttt{tx\_num\_desc} & maximal number of entries in the Tx ring (each packet-buffer occupies an entry). \\ \hline
1512	 	\texttt{rx\_num\_ooo\_buffers} & Relevant only for OOO connection (if 0, a default value of 2*rx\_num\_desc will be used). \\ \hline
1513	 	\texttt{rx\_drop\_ttl0\_flg} & can be set. \\ \hline
1514	 	\texttt{rx\_vlan\_removal\_en} & can be set if it is desired to get the VLAN stripped and out-of-band. \\ \hline
1515	 	\texttt{tx\_tc} & tx traffic class. 0 - regular tc, or for loopback use PURE\_LB\_TC or PKT\_LB\_TC for the rest. \\ \hline
1516	 	\texttt{tx\_dest} & Destination of tx -> Network or Loopback. \\ \hline
1517	 	\texttt{ai\_err\_packet\_too\_big} & How FW should handle packet too big error: (DROP, NOTHING, ASSERT). \\ \hline
1518	 	\texttt{ai\_err\_no\_buf} & How FW should handle no buffers error: (DROP, NOTHING, ASSERT). \\ \hline
1519	 	\texttt{gsi\_enable} & Relevant for RoCE only - is the ll2 intended to work with GSI Offload or not. \\ \hline
1520	 	\texttt{p\_connection\_handle} & Output parameter contains a handle which is used in future calls related to this LL2 connection. \\ \hline
1521	 	\texttt{cbs} & Callback functions that should be called on completion or release of rx / tx packets. \\ \hline
1522	 \end{tabular}
1523
1524
1525	\item \myfunc{ll2\_establish\_connection}{ll2_establish_connection} -- this function offloads the LL2 connection to the device (both Tx and Rx).
1526	\item After establishing the connection, it is possible to post Rx buffers and to send Tx packets.
1527\end{itemize}
1528
1530\silentfunc{ll2_post_rx_buffer}
1531\begin{itemize}
1532	\item \myfunc{ll2\_post\_rx\_buffer}{ll2_post_rx_buffer} -- this function adds the provided buffer to the receive ring. The buffer size should be at least mtu (as provided during connection start) + maximum Ethernet header size + cache line size + 4 (cache line size is typically 64 byte). \texttt{notify\_fw} should be set. \texttt{addr} should be a DMA-mapped address.
1533	\subsection{Related callback functions ( received in acquire\_connection)}
1534	\item \texttt{complete\_rx\_packet} -- this is a callback function that should be implemented in the upper driver. Ecore calls this function when a packet is received and written to a buffer in the Rx ring. \texttt{cookie} and \texttt{rx\_buf\_addr} are echoed from the call that posted that buffer. \texttt{placement\_offset} is the offset in bytes in the buffer, starting from which the packet was written. \texttt{packet\_length} is the total packet length in bytes. \texttt{opaque\_data\_0/1} and \texttt{b\_last\_packet} can be ignored. \texttt{vlan} is the VLAN tag stripped from the packet, and it is valid only if PARSING\_AND\_ERR\_FLAGS\_TAG8021QEXIST bit is set in \texttt{parse\_flags}. \texttt{parse\_flags} field contains additional flags which are mostly not interesting for the upper driver.
1535	\item \texttt{release\_rx\_packet} -- this is a callback function that should be implemented in the upper driver. Ecore calls this function when the connection is terminated and there are still buffers in the Rx ring. In this case it will call this function per each buffer, so the upper driver can free those buffers.
1536\end{itemize}
1537
1538\section{Transmit LL2 packets}
1539\silentfunc{ll2_prepare_tx_packet}
1540\silentfunc{ll2_set_fragment_of_tx_packet}
1541\begin{itemize}
1542	\item \myfunc{ll2\_prepare\_tx\_packet}{ll2_prepare_tx_packet} -- this function adds a new packet to the transmit ring. If the packet is composed of more than a single buffer, then the address and length of the additional buffers are provided to ecore by calling \texttt{ecore\_ll2\_set\_fragment\_of\_tx\_packet} for each additional buffer. \\
1543	\texttt{num\_of\_bds} is the number of buffers that compose the packet (including the first buffer), and is limited to CORE\_LL2\_TX\_LOCAL\_RING\_SIZE.
1544	\texttt{first\_frag} should be a DMA-mapped address, and \texttt{first\_frag\_len} is the buffer length in bytes. \texttt{vlan} is the VLAN tag to insert in the packet (if desired), and in this case CORE\_TX\_BD\_FLAGS\_VLAN\_INSERTION flag in \texttt{bd\_flags} should be set. \\
1545	For IP checksum and L4 checksum offload, CORE\_TX\_BD\_FLAGS\_IP\_CSUM and CORE\_TX\_BD\_FLAGS\_L4\_CSUM flags in \texttt{bd\_flags} should be set. \texttt{notify\_fw} should be set.
1546	\item \myfunc{ll2\_set\_fragment\_of\_tx\_packet}{ll2_set_fragment_of_tx_packet} -- this function provides the next buffer of a packet. \texttt{addr} should be a DMA-mapped address, and \texttt{nbytes} is the buffer length in bytes.
1547	\subsection{Related callback functions ( received in acquire\_connection)}
1548	\item \texttt{complete\_tx\_packet} -- this is a callback function that should be implemented in the upper driver. Ecore calls this function when the transmission of the packet is completed (it is called once per-packet). \texttt{cookie} and \texttt{first\_frag\_addr} are echoed from the call that posted that first fragment of the packet. \texttt{b\_last\_fragment} and \texttt{b\_last\_packet} can be ignored.
1549	\item \texttt{release\_tx\_packet} -- this is a callback function that should be implemented in the upper driver. Ecore calls this function when the connection is terminated and there are still packets in the Tx ring. In this case it will call this function per each packet, so the upper driver can free the associated buffers.
1550\end{itemize}
1551
1552\section{Stop LL2 connection}
1553\silentfunc{ll2_terminate_connection}
1554\silentfunc{ll2_release_connection}
1555\begin{itemize}
1556	\item \myfunc{ll2\_terminate\_connection}{ll2_terminate_connection} -- this function removes the LL2 connection from the device. When this function is called, ecore checks for non-completed Tx packets / Rx buffers, and calls the \texttt{release\_tx\_packet()} and \texttt{release\_rx\_packet()} callback functions respectively.
1557	\item \myfunc{ll2\_release\_connection}{ll2_release_connection} -- this function releases the resources for the LL2 connection.
1558\end{itemize}
1559
1560\section{Getting statistics}
1561\myfunc{ll2\_get\_stats}{ll2_get_stats} can be used to query the device for various ll2-related statistics.
1562
1563\SpillChapterFuncs
1564
1565\chapterimage{qlogic-full-36}
1566\chapter{Single Root I/O Virtualization}
1567\label{cha:sriov}
1568
1569\myindex{SRIOV} is a PCIe functionality which allows Physical functions (also termed \myindex{PF}s) to spawn Virtual functions (also termed \myindex{VF}s), with a limited set of registers in their PCI configuration space and bars, but that should supply roughly the same basic functionality.
1570
1571SR-IOV handling is performed by the ecore on the hypervisor as well as the ecore on the VM.
1572These work hand in hand (either via HW-channel or through SW-channel) to configure the VF device for SR-IOV.
1573With some exceptions, it could be said that the upper driver doesn't need to be aware of driving a VF instead of a PF.
1574Where on the PF side upper-layer driver accesses the ecore to send a ramrod on its behalf or perform a configuration,
1575on the VF side the upper driver will use the same API to access its ecore, which will in turn communicate with the PF's ecore via the channel to perform the equivalent configuration.
1576This is an abstraction, and there are quite a few reservations and exceptions, but that is the working model.
1577
1578Sections \ref{sec:sriov-hw-channel}, \ref{sec:sriov-tlv} mostly give a glimpse of the mechanism used by the ecore to support the feature, while the rest of the sections are of more interest to the upper-driver implementer since they contain the howtos.
1579
1580Some relevant documents are \cite{doc:iov-lec}, \cite{doc:iov-sys} and \cite{doc:iov-doc}.
1581
1582\section{IOV-related fields and terminology}
1583The \textit{ecore\_dev} contains an \textit{sriov\_info} field, which is filled very early during initialization (inside \textit{ecore\_hw\_prepare()}) according to the PCI configuration space SR-IOV capability. Later on, this struct is read-only by the ecore.
1584Upper driver can read values in this struct [instead of accessing PCI configuration space] if needed,
1585but there is a single field it 'owns', b\_hw\_channel -- In most distros VFs will communicate with PFs using the HW-channel [see section \ref{sec:sriov-hw-channel}], and upper-driver should set it to `true'. However, if upper-driver utilizes a designated SW-channel which it can use instead of the HW-channel, it should let this field remain 'false'. \\
1586
1587An additional important field is the \myindex{total\_vfs} which represents the maximal number of VFs current PF can possibly have. The macro \myindex{IS\_ECORE\_SRIOV} can be used to determine if PF has $\text{total\_vfs} > 0$, therefore whether IOV is relevant to the PF or not. \\
1588
1589Important terminology when talking about VFs is \myindex{relative\_vfid} versus \myindex{absolute\_vfid}. The relative vfid is the zero-based index of the VF relative to its parent PF, i.e., the first VF of a given PF is always 0, second is 1, etc.
1590The absolute vfid is the zero-based index of the VF relative to all the VFs on the same path, i.e., it's possible the first VF of a given PF will have an absolute vfid which is greater than zero.
1591
1592For most upper-driver uses, the relative vfid is the interesting index. Ecore sometimes needs to use the absolute value for configuring the FW/HW.
1593
1594\section{Initializing and Closing VFs}
1595When a PF is about to initialize its VFs, it should enable the access of each VF to the HW by calling \myfunc{iov\_init\_hw\_for\_vf}{iov_init_hw_for_vf} for each VF [passing its relative vfid].
1596\begin{NOTICE}
1597	At this point upper-layer driver has to know the number of interrupts to assign to each VF, since the MSIX table in each VF configuration space must be arranged by ecore prior to VF activation.
1598\end{NOTICE}
1599
1600Following this, upper-driver can initiate the sequence [usually via OS api] that would enable the VFs and cause them to be probed.
1601Afterwards, upper-driver can initialize the VF same as it would have the PF, i.e., the difference in initialization logic is 'hidden' inside the ecore. Upper-layer code doesn't need to contain all sorts of if-else clauses to differentiate between VF and PF [at least, not as far as the ecore initialization is concerned].
1602
1603Closing the VF should operate smoothly without need of any special calls. I.e., regular closure sequence for PFs should be translated by ecore opaquely into a sequence closing the VF.
1604
1605The only 'special' effort that should be taken is that after all the flow is done the PF's upper-driver should call \myfunc{iov\_release\_hw\_for\_vf}{iov_release_hw_for_vf}. After doing this, VF can only be re-activated by re-calling \textit{ecore\_iov\_init\_hw\_for\_vf()}.
1606
1607If the upper-driver has the option, during sriov-disable prior to VFs being unset in the PF's PCI configuration space, upper-driver should call \myfunc{iov\_set\_vf\_to\_disable}{iov_set_vf_to_disable} for each one of its active VFs. This will result in a cleaner FW/HW after closure is complete.
1608
1609\section{Message passing from VF to PF}
1610\label{sec:sriov-tlv}
1611The VF's PCI bar is very different from the PF bar, and with much more limited access toward chip; see \cite{doc:iov-sys} for details about the VF bar. As a result, most of the slowpath configuration that needs to be done for the VF actually has to be done by the PF.
1612
1613To support this, there is a mechanism of \myindex{TLV}\footnote{Type-Length-Value} message passing from VF to PF, in which the VF can request the PF to either perform services for it or supply it with information which is otherwise inaccessible for the VF.
1614This message passing is usually done via the HW channel [see section \ref{sec:sriov-hw-channel}], but assuming the existence of an alternative method [i.e., SW-channel] it can be done via it just as well.
1615
1616\begin{exampleT}
1617	During \textit{ecore\_hw\_prepare()} ecore gathers information about the chip from various locations - HW, shared memory with Management FW, etc. However, almost all of that information is inaccessible to the VF. Thus the VF has an alternative flow by which it sends an ACQUIRE message to the PF, notifying it that it's up and requesting information about the device - e.g., number of resources such as status blocks and queues available to the VF.
1618\end{exampleT}
1619
1620Notice the upper-driver itself should never initiate such a message passing directly; All such message passing is 'hidden' inside the VF's ecore.
1621
1622\begin{exampleT}
1623	When a VF driver wants to start a vport, it calls \textit{ecore\_sp\_vport\_start()},
1624 unaware that inside the ecore this will send a VPORT\_START TLV message from VF to PF, and that the PF will open the vport for the VF as a result.
1625\end{exampleT}
1626
1627\section{HW channel}
1628\label{sec:sriov-hw-channel}
1629This is how the HW-channel operates [High level]:
1630
1631\silentfunc{iov_process_mbx_req}
1632\silentfunc{iov_copy_vf_msg}
1633\begin{enumerate}
1634	\item Prior to VF activation, PF enables VF access to the HW-channel, which actually permits it to access its PCI bar.
1635	\item VF prepares a message on DMA-able memory, which also contains an address of an additional DMA-able memory upon which the VF will poll for the PF reply.
1636	\item VF writes a `trigger' containing the buffer's ghost physical address into the specific address in the USDM. It then polls until reply is received [or timeout is reached].
1637	\item This BAR access to the Ustorm RAM is trapped as an aggregated interrupt and activates a handler in Storm FW.
1638	\item FW identifies the sending VF according to address and trigger's content and derives the parent PF's id. It then triggers an interrupt [event] on the PF, filling the event's cookie with buffer's address.
1639	\item PF driver's ISR wakes. It recognizes the message and calls OSAL\_PF\_VF\_MSG to notify upper-layer driver of the message; This is mostly since the slowpath context isn't the proper place to handle VF messages.
1640	\item Upper-layer driver should utilize DMAE [\myfunc{iov\_copy\_vf\_msg}{iov_copy_vf_msg}] in order to copy the buffer from the VF's memory domain into its own. [PF uses the VF's pci requestor-id for the DMAE transaction, to access the VM's Ghost Physical Address].
1641	Following that, it should schedule the proper context for handling the VF message calling \myfunc{iov\_process\_mbx\_req}{iov_process_mbx_req} to allow ecore to process the VF's message.
1642	\item ... PF processes the VF's request ...
1643	\item PF prepares an answer for the VF [success, failure, etc.] which might also contain information. It uses DMAE to copy this message to the VF's reply address [specified in the VF's message].
1644	\item VF wakes from the PF's message and processes the answer.
1645\end{enumerate}
1646
1647One optional auxiliary function that can be used by the ecore-client is \myfunc{iov\_pf\_get\_pending\_events}{iov_pf_get_pending_events}. It will return a bitmask of all the VFs belonging to the PF for which there's a message yet to be processed.
1648
1649Notice that the HW-channel is one-pending, i.e., VF cannot send an additional message until PF has notified FW that it's done processing the message.
1650
1651\begin{NOTICE}
1652	If VF will try sending an additional message, FW will mark it as malicious.
1653\end{NOTICE}
1654
1655\section{Message Passing from PF to VF}
1656The Message passing from VF to PF benefits from the PF's slowpath status-block, i.e., the ability of the PF to receive slowpath interrupts.
1657Since the VF hasn't got such a status block allocated for it, the message passing between the PF and the VF consists of polling on the VF side.
1658The VF has a buffer named \myindex{bulletin-board} into which the PF posts messages.
1659The VF would periodically poll this buffer for updates.
1660
1661\begin{exampleT}
1662	PF can use bulletin boards to notify VF of current link state. Notice that link state doesn't necessarily have to reflect the physical link state.
1663E.g., Hypervisor tools might be used to configure VF link state as always up regardless of physical state, so that VFs could communicate using Tx-Switching.
1664\end{exampleT}
1665
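1666The bulletin board periodic sampling is a policy that needs to be determined and done by the upper-layer driver. It's done by calling the API function \textit{ecore\_vf\_read\_bulletin()}, which samples the bulletin board and indicates whether it has changed since it was last read.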
1668If such a change occurs, since the bulletin doesn't contain deltas from previous messages but rather the entire data [due to lack of handshake the PF can't know if VF read previous bulletin boards], the upper-driver has a wide assortment of functions-per-feature which are defined in ecore\_vf\_api.h and can be used to learn of the current state. E.g., \myfunc{vf\_get\_link\_state}{vf_get_link_state},
1669\myfunc{vf\_get\_port\_mac}{vf_get_port_mac}.
1670
1671\subsection{How the bulletin board works}
1672
1673[Inner workings of the ecore; Not necessary for the upper-driver implementer]
1674
1675\begin{enumerate}
1676	\item During \textit{ecore\_hw\_prepare()} of the VF, ecore allocates a DMA-able buffer for the bulletin board.
1677	\item During the ACQUIRE message sent from VF to PF, VF posts the physical address of the bulletin board as well as its size.
1678	\item During the ACQUIRE response sent from PF to VF, PF agrees upon the size of bulletin board which will be used [forward-backward compatibility].
1679	\item Whenever any of the bulletin fields the PF wants to post changes, PF increments a counter, calculates a CRC and uses DMAE to copy its local buffer into the VF's bulletin buffer.
1680	\item On the VF-side, the polled \textit{ecore\_vf\_read\_bulletin()} samples the buffer, verifies the CRC [to make sure it has a consistent image of the buffer] and, if the bulletin index has been incremented since last seen, the VF gets updated according to the new bulletin board.
1681\end{enumerate}
1682
1683\section{Function Level Reset}
1684PCI Function Level Reset [\myindex{FLR}] is a functionality triggered by a write to a specific [standard] bit in the PCI function configuration space, which should result in the function being reset.
1685On many OSes this feature is used to reset VFs on certain occasions, such as their physical assignment and de-assignment from VMs.
1686In addition, FLR might be used internally by driver/FW in case of malicious VFs, where that VF's database should be cleared before re-enablement.
1687
1688\begin{NOTICE}
1689	At this time, malicious VF handling does not exist in the ecore.
1690\end{NOTICE}
1691
1692The FLR flow is a complicated flow which involves Management firmware, storm firmware and driver all working on cleaning the HW and their own databases
1693[See \cite{doc:iov-sys} for more details]. From driver point-of-view, management FW notifies driver of FLR after it and the storm FW have already done some work [storm FW has done what's called `initial cleanup'].
1694Ecore handles the MFW message about FLR, and eventually notifies upper-layer driver via \myindex{OSAL\_VF\_FLR\_UPDATE} about the FLR.
1695
1696\begin{NOTICE}
1697	Again, this OSAL is mainly for allowing the upper-layer driver to move this run from the slowpath context into a different context.
1698\end{NOTICE}
1699
1700Upper-layer driver should clean whatever non-ecore `volatile' information it holds for those VFs, and then call \myfunc{iov\_vf\_flr\_cleanup}{iov_vf_flr_cleanup}, which will continue the FLR process -- send a final cleanup ramrod to FW and notify MFW that the FLR process has been completed. Following this call, the FLRed VFs should be operational and in `clean slate' mode.
1701
1702\begin{NOTICE}
1703	Unless \textit{ecore\_iov\_set\_vf\_to\_disable()} was called, in which case following the FLR those VFs will be disabled in FW/HW.
1704\end{NOTICE}
1705
1706\section{Versioning}
1707\subsection{Slowpath versions}
1708SR-IOV is exposed to complex versioning challenges. Specifically, a given PF driver may be working with VF drivers of older and/or newer versions at the same time.
1709This means that the channel and bulletin board must be forwards and backwards compatible. The Bulletin Board achieves this by only adding new fields.
1710The Channel achieves compatibility through a TLV interface. Messages will always contain a type, length, value header, and may have multiple such parts.
1711The receiver of a message (be it PF receiving a request or VF receiving a response) will parse the message, process the parts it is aware of and be able to skip over parts which it doesn't recognize.
1712This design allows to declare messages as obsolete, modify existing messages by adding/removing modular pieces, etc.
1713
1714\subsection{Fastpath versions}
1715The compatibility requirements of fastpath flows have to be lenient, as we can't afford to penalize the performance.
1716If the fastpath API changes in a non backward compatible fashion (assumed to be a rare occurrence) the VF will either fail to load, or else have to carry with it several alternate implementations for fastpath. The VF driver learns of the fastpath version from the slowpath interaction with the PF.
1717
1718\SpillChapterFuncs
1719
1720\chapterimage{qlogic-full-36}
1721\chapter{Selftest}
1722\label{cha:Selftest}
1723
1724This chapter describes the ecore interfaces for selftests. The scope of the selftests is to sample various aspects of device functionality and verify that it is operational. It is not intended and does not lay claim to perform full coverage of any functionality. \\
1725
1726\section{Register Test}
1727	\myfunc{selftest\_register}{selftest_register} -- this test verifies the data integrity of the registers. It writes a predefined value to the register, reads it back and verifies that the contents are correctly saved. It saves the register original content before performing the test and restores its value after the test. This test is performed via MFW and accesses registers from both engines as well as registers from engine common blocks.
1728
1729\section{Clock Test}
1730	\myfunc{selftest\_clock}{selftest_clock} -- it measures the clock frequencies of the E4 modules. The clocks verified in this test are,
1731	\begin{itemize}
1732		\item Main clock frequency
1733		\item STORM clock frequency
1734		\item NW clock frequency
1735	\end{itemize}
1736
1737\section{Interrupt Test}
1738	\myfunc{selftest\_interrupt}{selftest_interrupt} -- this test verifies the interrupt path. Ecore employs its most basic flow which exercises interrupts, the heartbeat ramrod. Ramrod is sent and interrupt is received.
1739
1740\section{Memory Test}
1741	\myfunc{selftest\_memory}{selftest_memory} -- this test samples some of the memories in the device. Ecore employs its most basic flow which exercises memories, again the heartbeat ramrod. In this flow context is loaded to the context manager memory and is verified by the storm FW (otherwise the ramrod would fail).
1742
1743\section{NVRAM Test}
1744	\myfunc{selftest\_nvram}{selftest_nvram} -- this performs the nvram test. It loops through all the nvram partitions, reads the image on the partition and validates its crc.
1745
1746\SpillChapterFuncs
1747
1748\chapterimage{qlogic-full-36}
1749\chapter{Precision Time Protocol (PTP) support}
1750\label{cha:ptp}
1751This chapter provides a high-level overview of PTP and describes the ecore interfaces for the same. PTP, also known as Time Sync, allows the synchronization of the clocks in distributed systems. The protocol selects one clock in the network as master clock and all other clocks (slave clocks) synchronize their clocks with the master. The driver's responsibilities include enable/disable of the PTP feature on the device, register/un-register of the hardware clock and its operations to the OS and configuring the required Rx/Tx PTP filters. HW/FW does the timestamping of Tx/Rx PTP packets; the driver needs to read these timestamp values and present them to upper layer protocols (e.g., IPv4). Rx timestamping will be available during the Rx interrupt processing of the driver. FW does the Tx timestamping when the first byte of the PTP packet is placed on the wire; the driver has to poll for the availability of this timestamp value when processing the PTP Tx packet. \\
1752\section{Ecore APIs}
1753To enable PTP support, ecore-client should call \myfunc{ptp\_enable}{ptp_enable} and then configure the required PTP filters which include
1754enabling the Tx timestamping using \myfunc{ptp\_hwtstamp\_tx\_on}{ptp_hwtstamp_tx_on} and configuring the Rx filter mode
1755using \myfunc{ptp\_cfg\_rx\_filters}{ptp_cfg_rx_filters} API.
1756Rx filter mode instructs the device to trace the configured Rx PTP packets such as L2, IPv4 etc.
1760The API \myfunc{ptp\_adjfreq}{ptp_adjfreq} provides an implementation for adjusting the hardware clock by a rate given in parts-per-billion (ppb) units. \\
1761As part of feature clean up, ecore client should call \myfunc{ptp\_disable}{ptp_disable} API to disable the PTP feature on the hardware. \\
1762\begin{NOTICE}
1763It is the driver's responsibility to read the Rx/Tx timestamp values. The timestamp register will not be freed for next PTP packets until current value is read by the driver.
1764\end{NOTICE}
1765
1766\SpillChapterFuncs
1767
1768%\chapterimage{qlogic-full-36}
1769%\chapter{Statistics}
1770%\begin{NOTICE}
1771%Placeholder - owner Dmitry
1772%\end{NOTICE}
1773
1774%\chapterimage{qlogic-full-36}
1775%\chapter{Peripherals}
1776%\begin{NOTICE}
1777%Placeholder - owner Yuval
1778%\end{NOTICE}
1779
1780\appendix
1781\chapter{Osal Documentation}
1782\label{app:osal}
1783\verbatiminput{osal.txt}
1784
1785%----------------------------------------------------------------------------------------
1786%	INDEX
1787%----------------------------------------------------------------------------------------
1788
1789\cleardoublepage
1790\setlength{\columnsep}{0.75cm}
1797