1*14b24e2bSVaishali Kulkarni%----------------------------------------------------------------------------------------
2*14b24e2bSVaishali Kulkarni%	PACKAGES AND OTHER DOCUMENT CONFIGURATIONS
3*14b24e2bSVaishali Kulkarni%----------------------------------------------------------------------------------------
4*14b24e2bSVaishali Kulkarni
5*14b24e2bSVaishali Kulkarni\documentclass[11pt,fleqn,hidelinks,oneside]{book} % Default font size and left-justified equations
6*14b24e2bSVaishali Kulkarni\usepackage[nottoc,notlot,notlof]{tocbibind}
7*14b24e2bSVaishali Kulkarni\makeindex % Tells LaTeX to create the files required for indexing
8*14b24e2bSVaishali Kulkarni%----------------------------------------------------------------------------------------
9*14b24e2bSVaishali Kulkarni
% \insertcode{<file>}{<caption>}: insert the source file <file> as a listing anywhere in the document.
%   #1 - script location/filename; it is also used as the \label of the listing
%   #2 - caption for the listing (may be empty)
% The listing is wrapped in an itemize with an empty item label, uses the
% listings style Style1 and is placed as an h! float.
% Both key values are braced so that a comma, '=' or ']' inside the caption or
% the path cannot be mistaken for key-value syntax by \lstinputlisting.
\newcommand{\insertcode}[2]{%
	\begin{itemize}%
		\item[]\lstinputlisting[caption={#2},label={#1},style=Style1,float=h!]{#1}%
	\end{itemize}%
}
12*14b24e2bSVaishali Kulkarni
% \myref{<label>}: print the cross-reference to <label> in blue, enclosed in square brackets.
\newcommand{\myref}[1]{%
	\textcolor{blue}{[\ref{#1}]}%
}
15*14b24e2bSVaishali Kulkarni
% \myindex{<term>}: print <term> and add it to the index.
% The index entry is sorted by <term> itself and displayed in typewriter font.
\newcommand{\myindex}[1]{%
	\index{#1@\texttt{#1}}%
	#1%
}
18*14b24e2bSVaishali Kulkarni
% Accumulator for the listings of the API functions referenced in the current chapter.
\newcommand{\ChapterFuncs}{}

% \silentfunc{<name>}: append a listing of snippets/<name>_generated.h to
% \ChapterFuncs; nothing is typeset at the point of use.
% The accumulator is extended with \gdef. A plain (local) \def made inside an
% environment, e.g. in an \item between \begin{itemize} and \end{itemize}, is
% undone when that environment's group closes, so functions referenced there
% were silently missing from the chapter's list.
\newcommand{\silentfunc}[1]{%
	\expandafter\gdef\expandafter\ChapterFuncs\expandafter{\ChapterFuncs { } \insertcode{snippets/#1_generated.h}{}}%
}
24*14b24e2bSVaishali Kulkarni
% \myfunc{<display name>}{<file name>}: reference the API function ecore_<display name>.
%   #1 - function name without the ecore_ prefix, underscores escaped (e.g. init\_dp)
%   #2 - the same name with plain underscores (e.g. init_dp), handed to \silentfunc
% Adds an index sub-entry under "API Function", registers the function through
% \silentfunc and prints ecore_<display name>() in typewriter font.
\newcommand{\myfunc}[2]{%
	\index{ZZZ@API Function!ecore\_#1@\texttt{ecore\_#1}}%
	\silentfunc{#2}%
	\texttt{ecore\_#1()}%
}
28*14b24e2bSVaishali Kulkarni
% bottompar: end the current paragraph and insert a \fill stretch before the
% environment's content; a \clearpage is issued when the environment ends.
\newenvironment{bottompar}%
	{\par\vspace*{\fill}}%
	{\clearpage}
30*14b24e2bSVaishali Kulkarni
% \SpillChapterFuncs: open a section titled "API functions discussed in this
% chapter", emit everything accumulated in \ChapterFuncs and then empty the
% accumulator again.
% (An earlier, disabled variant wrapped the output in the bottompar environment
% under a bold typewriter heading instead of using a \section.)
\newcommand{\SpillChapterFuncs}{%
	\section{API functions discussed in this chapter}
	\ChapterFuncs{}%
	\renewcommand{\ChapterFuncs}{}%
}
39*14b24e2bSVaishali Kulkarni
40*14b24e2bSVaishali Kulkarni%----------------------------------------------------------------------------------------
41*14b24e2bSVaishali Kulkarni
\input{structure} % Insert the structure.tex file which contains the majority of the structure behind the template
43*14b24e2bSVaishali Kulkarni
% Global listings defaults: \smallskipamount above and below every listing,
% boxpos and float both set to h!.
%\lstset{belowskip=-20pt plus 2pt}
\lstset{%
	belowskip=\smallskipamount,%
	aboveskip=\smallskipamount,%
	boxpos=h!,%
	float=h!%
}
% \@fptop is an internal (@-named) length, hence the \makeatletter/\makeatother pair.
\makeatletter
\setlength{\@fptop}{5pt}
\makeatother
49*14b24e2bSVaishali Kulkarni
50*14b24e2bSVaishali Kulkarni
51*14b24e2bSVaishali Kulkarni\usepackage{hyperref}
52*14b24e2bSVaishali Kulkarni\usepackage{verbatim}
53*14b24e2bSVaishali Kulkarni
54*14b24e2bSVaishali Kulkarni%Macros
% \mlist{<items>}: wrap <items> in an itemize environment.
\newcommand{\mlist}[1]{%
	\begin{itemize}%
		{#1}%
	\end{itemize}%
}
% \mlisti{<term>}{<text>}: a single item reading "<term> -- <text>", with <term> in red.
\newcommand{\mlisti}[2]{%
	\item {\textcolor{red}{#1} -- #2}%
}
57*14b24e2bSVaishali Kulkarni
% Box registers used by \greybox, allocated once. (Calling \newbox inside the
% macro body claimed two fresh box registers on every use of \greybox.)
\newbox\contentbox
\newbox\bkgdbox

% \greybox{<content>}: typeset <content> in a \hsize-wide box on a grey
% background, padded by \columnsep on every side and followed by a
% \baselineskip of vertical space.
% The grey fill is emitted with \pdfliteral, so pdfTeX output is required.
\long\def\greybox#1{%
    % Content box: the text is set in a vbox whose \hsize is reduced by
    % 2\columnsep, with no paragraph indentation and \baselineskip between
    % paragraphs; \columnsep kerns provide the padding.
    \setbox\contentbox\hbox to \hsize{%
        \vtop{
            \kern\columnsep
            \hbox to \hsize{%
                \kern\columnsep%
                \advance\hsize by -2\columnsep%
                \setlength{\textwidth}{\hsize}%
                \vbox{
                    \parskip=\baselineskip
                    \parindent=0bp
                    #1
                }%
                \kern\columnsep%
            }%
            \kern\columnsep%
        }%
    }%
    % Background box: a rule with the dimensions of the content box, drawn with
    % fill colour 0.75 0.75 0.75; the fill colour is set back to 0 0 0 afterwards.
    \setbox\bkgdbox\vbox{
        \pdfliteral{0.75 0.75 0.75 rg}
        \hrule width  \wd\contentbox %
               height \ht\contentbox %
               depth  \dp\contentbox
        \pdfliteral{0 0 0 rg}
    }%
    % Zero width makes the content box start at the same position as the background.
    \wd\bkgdbox=0bp%
    \vbox{\hbox to \hsize{\box\bkgdbox\box\contentbox}}%
    \vskip\baselineskip%
}
89*14b24e2bSVaishali Kulkarni
% \greycom{<term>}{<text>}: a \greybox containing "<term> -- <text>", with <term> in red.
\newcommand{\greycom}[2]{%
	\greybox{\textcolor{red}{#1} -- #2}%
}
91*14b24e2bSVaishali Kulkarni
92*14b24e2bSVaishali Kulkarni
% MyMdStyle: the mdframed style on which the warning, TBD, NOTICE and REMINDER
% environments are based (each of them overrides some of the keys below).
\global \mdfdefinestyle{MyMdStyle}{%
	linecolor=black,%
	linewidth=1,%
	outerlinecolor=red,%
	outerlinewidth=2pt,%
	roundcorner=5pt,%
	backgroundcolor=brown!10,%
	nobreak=true%
}
97*14b24e2bSVaishali Kulkarni
% warning: a Warning environment inside an mdframed box drawn with MyMdStyle,
% followed by 5pt of vertical space.
\newenvironment{warning}{%
	\par
	\begin{mdframed}[style=MyMdStyle] \begin{Warning}%
}{%
	\end{Warning}\end{mdframed}\vspace{5pt}\par
}
101*14b24e2bSVaishali Kulkarni
% \HRule: a horizontal rule, 0.5mm thick and as wide as the current line.
\newcommand{\HRule}{%
	\rule{\linewidth}{0.5mm}%
}
% TBD: a question environment inside an mdframed box; MyMdStyle with a blue
% outer line and a light blue background, preceded by 3pt of vertical space.
\newenvironment{TBD}{%
	\par\vspace{3pt}%
	\begin{mdframed}[style=MyMdStyle,outerlinecolor=blue,backgroundcolor=blue!10]%
	\begin{question}%
}{%
	\end{question}\end{mdframed}\par
}
108*14b24e2bSVaishali Kulkarni
% NOTICE: an mdframed box (MyMdStyle with black lines and a 1.5pt outer line)
% whose content is a single itemize item labelled with a large \Info symbol.
% The former `{}{\leftmargin=1cm \labelwidth=\leftmargin}' after \begin{itemize}
% has been dropped: that is the argument syntax of the `list' environment, while
% the standard itemize takes no mandatory arguments, so the two groups were read
% as plain text groups and their local assignments had no effect on the layout.
% NOTE(review): assumes structure.tex does not redefine itemize to take such
% arguments; switch to a `list' environment if a custom margin is really wanted.
\newenvironment{NOTICE}{%
	\par
	\begin{mdframed}[style=MyMdStyle,outerlinecolor=black,linecolor=black,outerlinewidth=1.5pt]%
	\begin{itemize}%
	\item[\Large\Info]%
}{%
	\end{itemize}\end{mdframed}\par
}
115*14b24e2bSVaishali Kulkarni
% REMINDER: a reminder environment inside an mdframed box; MyMdStyle with blue
% inner and outer lines and a 2pt outer line.
\newenvironment{REMINDER}{%
	\par
	\begin{mdframed}[style=MyMdStyle,outerlinecolor=blue,linecolor=blue,outerlinewidth=2pt]%
	\begin{reminder}%
}{%
	\end{reminder}\end{mdframed}\par
}
121*14b24e2bSVaishali Kulkarni
122*14b24e2bSVaishali Kulkarni\bibliographystyle{plain}
123*14b24e2bSVaishali Kulkarni
124*14b24e2bSVaishali Kulkarni\begin{document}
125*14b24e2bSVaishali Kulkarni
126*14b24e2bSVaishali Kulkarni\begin{titlepage}
127*14b24e2bSVaishali Kulkarni\begin{center}
128*14b24e2bSVaishali Kulkarni
129*14b24e2bSVaishali Kulkarni% Upper part of the page. The '~' is needed because \\
130*14b24e2bSVaishali Kulkarni% only works if a paragraph has started.
131*14b24e2bSVaishali Kulkarni\includegraphics[width=0.5\textwidth]{./qlogic-logo}~\\[3cm]
132*14b24e2bSVaishali Kulkarni
133*14b24e2bSVaishali Kulkarni% Title
134*14b24e2bSVaishali Kulkarni\HRule \\[0.4cm]
135*14b24e2bSVaishali Kulkarni{ \huge \bfseries E4 ecore \\[0.4cm] }
136*14b24e2bSVaishali Kulkarni
137*14b24e2bSVaishali Kulkarni\HRule \\[1.5cm]
138*14b24e2bSVaishali Kulkarni
139*14b24e2bSVaishali Kulkarni\begin{minipage}{0.4\textwidth}
140*14b24e2bSVaishali Kulkarni\begin{flushleft} \large
141*14b24e2bSVaishali Kulkarni\emph{Authors:}\\
142*14b24e2bSVaishali KulkarniAriel \textsc{Elior} \\
143*14b24e2bSVaishali KulkarniMichal \textsc{Kalderon} \\
144*14b24e2bSVaishali KulkarniYuval \textsc{Mintz} \\
145*14b24e2bSVaishali KulkarniMerav \textsc{Sicron} \\
146*14b24e2bSVaishali KulkarniTomer \textsc{Tayar} \\
147*14b24e2bSVaishali KulkarniSudarsana Reddy \textsc{Kalluru} \\
148*14b24e2bSVaishali Kulkarni\end{flushleft}
149*14b24e2bSVaishali Kulkarni\end{minipage}
150*14b24e2bSVaishali Kulkarni\begin{minipage}{0.4\textwidth}
151*14b24e2bSVaishali Kulkarni\begin{flushright} \large
152*14b24e2bSVaishali Kulkarni\emph{Version:} \\
153*14b24e2bSVaishali Kulkarni0.0.10
154*14b24e2bSVaishali Kulkarni\end{flushright}
155*14b24e2bSVaishali Kulkarni\end{minipage}
156*14b24e2bSVaishali Kulkarni
157*14b24e2bSVaishali Kulkarni\vfill
158*14b24e2bSVaishali Kulkarni
159*14b24e2bSVaishali Kulkarni% Bottom of the page
160*14b24e2bSVaishali Kulkarni{\large \today}
161*14b24e2bSVaishali Kulkarni
162*14b24e2bSVaishali Kulkarni\end{center}
163*14b24e2bSVaishali Kulkarni\end{titlepage}
164*14b24e2bSVaishali Kulkarni
165*14b24e2bSVaishali Kulkarni\pagestyle{empty} % No headers
166*14b24e2bSVaishali Kulkarni\chapterimage{qlogic-full-36}
167*14b24e2bSVaishali Kulkarni\tableofcontents % Print the table of contents itself
168*14b24e2bSVaishali Kulkarni
169*14b24e2bSVaishali Kulkarni\cleardoublepage % Forces the first chapter to start on an odd page so it's on the right
170*14b24e2bSVaishali Kulkarni
171*14b24e2bSVaishali Kulkarni\pagestyle{fancy} % Print headers again
172*14b24e2bSVaishali Kulkarni
173*14b24e2bSVaishali Kulkarni
174*14b24e2bSVaishali Kulkarni%----------------------------------------------------------------------------------------
175*14b24e2bSVaishali Kulkarni%	Real Content
176*14b24e2bSVaishali Kulkarni%----------------------------------------------------------------------------------------
177*14b24e2bSVaishali Kulkarni\chapterimage{pictures/qlogic-full-36.jpg}
178*14b24e2bSVaishali Kulkarni\chapter{Introduction}
179*14b24e2bSVaishali KulkarniBy definition, a driver is the entity which allows an OS to drive a hardware device.
180*14b24e2bSVaishali KulkarniAs such the driver contains both device-specific parts and OS-specific parts.
181*14b24e2bSVaishali KulkarniThe Everest architecture, with programmable fastpath processors (Storms), host-based device-dedicated memory (ILT), and minimal on-chip management presents a device which requires a driver with significant portions of device-specific code.
182*14b24e2bSVaishali Kulkarni
Drivers will be implemented for Everest 4 devices in many OSs (Linux, Windows, FreeBSD, Solaris, ESX, AIX, HP-UX, \ldots).
184*14b24e2bSVaishali KulkarniImplementing the device-specific code again and again in each OS is both wasteful and difficult to maintain.
For this purpose the ecore was conceived:
a large mass of code for operating and interacting with the Everest 4 device, to be incorporated into and used by OS drivers.
187*14b24e2bSVaishali Kulkarni
188*14b24e2bSVaishali KulkarniIn the abstract, the ecore is a layer between the HW/FW and the OS.
189*14b24e2bSVaishali KulkarniIt is device-specific and OS-agnostic. When ecore code requires OS services (e.g. memory allocation, pci configuration space access, etc.) it calls an abstract OS function for that purpose. These are implemented in OS-specific layers.
190*14b24e2bSVaishali KulkarniEcore flows may be driven by the HW (e.g. by an interrupt) or by the OS specific portion of the driver (e.g. driver load/unload).
191*14b24e2bSVaishali Kulkarni
192*14b24e2bSVaishali Kulkarni\begin{itemize}
193*14b24e2bSVaishali Kulkarni
194*14b24e2bSVaishali Kulkarni	\item Slowpath flows tend to reside largely in ecore and less so in OS specific layers. As much of the functionality as possible is placed in the ecore to leverage it across multiple platforms. \\
195*14b24e2bSVaishali Kulkarni
196*14b24e2bSVaishali Kulkarni	\item Fastpath flows tend to be in the OS specific layer as too much layering and abstraction is out of place in fastpath.
197*14b24e2bSVaishali KulkarniHowever, the fastpath would usually be set up by ecore flows, for example the address where transmission flow should write a doorbell to the BAR is determined by the ecore at init phase and this address is supplied by ecore to the OS specific layer. \\
198*14b24e2bSVaishali Kulkarni
199*14b24e2bSVaishali Kulkarni\end{itemize}
200*14b24e2bSVaishali Kulkarni
201*14b24e2bSVaishali KulkarniDifferent drivers in the same OS may have the ecore within them, and may use it for similar or different purposes:
202*14b24e2bSVaishali Kulkarni
203*14b24e2bSVaishali Kulkarni\begin{exampleT}
204*14b24e2bSVaishali Kulkarni	In linux there will be an ethernet driver, an fcoe driver, an iscsi driver, a roce driver and also a slim driver for the diag utility.
	All of these may exist in the same system.
	All of these will have an ecore instance incorporated in them.
	Any one of the drivers might use the ecore to initialize the device, or the sections of the device pertaining to that driver’s operation.
208*14b24e2bSVaishali Kulkarni	A storage driver may use the ecore for storage specific purposes, such as the initialization and allocation of task context.
209*14b24e2bSVaishali Kulkarni\end{exampleT}
210*14b24e2bSVaishali Kulkarni
211*14b24e2bSVaishali KulkarniThe ecore is not a driver in its own capacity, but only code which is used by other drivers. Thus, separate drivers, including separate instances of the same driver within an OS, have separate instances of the ecore within them, which are concurrently active.
212*14b24e2bSVaishali Kulkarni
\section{Scope}
214*14b24e2bSVaishali KulkarniThis document strives to define and detail what is the ecore.
215*14b24e2bSVaishali KulkarniThe first parts of the document deal with the concept of the ecore, and its place in the software layers between the device and the OS.
216*14b24e2bSVaishali KulkarniThe rest of the document deals with the content of the ecore.
217*14b24e2bSVaishali KulkarniThis document does not deal with the needs and use cases of any specific OS or tool, but only with the common ground which is the ecore.
218*14b24e2bSVaishali Kulkarni
The document sometimes delves in depth into the inner workings of the ecore. Since a programmer coming to utilize the ecore might not need [or want] to know those inner workings, such a person should look into specific sections in each chapter, specifically:
220*14b24e2bSVaishali Kulkarni\begin{enumerate}
221*14b24e2bSVaishali Kulkarni	\item Chapter \ref{cha:overview}'s introduction and section \ref{sec:overview-api} for a listing of the ecore API files and their locations.
222*14b24e2bSVaishali Kulkarni
223*14b24e2bSVaishali Kulkarni	\item OS abstraction layer [\ref{sec:osal}] for functions needed to be implemented by upper-layer driver in order to support the ecore.
224*14b24e2bSVaishali Kulkarni
225*14b24e2bSVaishali Kulkarni	\item Register-access [\ref{cha:reg}], mainly for learning about PTTs which are required by various ecore API functions.
226*14b24e2bSVaishali Kulkarni
	\item Initialization and De-initialization of the HW [\ref{sec:init-init}, \ref{sec:init-de-init}].
228*14b24e2bSVaishali Kulkarni
229*14b24e2bSVaishali Kulkarni	\item Status block initialization [\ref{ssec:sb-init}] and Interrupt handling flow [\ref{sec:sb-flow}].
230*14b24e2bSVaishali Kulkarni
231*14b24e2bSVaishali Kulkarni	\item Link interface [\ref{sec:mfw-link}].
232*14b24e2bSVaishali Kulkarni
233*14b24e2bSVaishali Kulkarni	\item Protocol related initialization/de-initialization:
234*14b24e2bSVaishali Kulkarni	\begin{enumerate}
235*14b24e2bSVaishali Kulkarni		\item L2-related, see Chapter [\ref{cha:l2}].
236*14b24e2bSVaishali Kulkarni	\end{enumerate}
237*14b24e2bSVaishali Kulkarni\end{enumerate}
238*14b24e2bSVaishali Kulkarni
239*14b24e2bSVaishali KulkarniIn addition, each chapter which includes ecore API functions that can be called by the upper-layer driver lists those functions' prototypes at its end.
240*14b24e2bSVaishali Kulkarni
241*14b24e2bSVaishali Kulkarni%\bibliography{ecore}
242*14b24e2bSVaishali Kulkarni
243*14b24e2bSVaishali Kulkarni\chapterimage{qlogic-full-36}
244*14b24e2bSVaishali Kulkarni\chapter{Ecore interface overview}
245*14b24e2bSVaishali Kulkarni\label{cha:overview}
246*14b24e2bSVaishali KulkarniThe ecore can be found at the perforce servers under:
247*14b24e2bSVaishali Kulkarni\begin{center}
248*14b24e2bSVaishali Kulkarni	//servers/main/nx2/579xx/drivers/ecore
249*14b24e2bSVaishali Kulkarni\end{center}
250*14b24e2bSVaishali Kulkarni
251*14b24e2bSVaishali KulkarniMost of the ecore consists of the \textit{inner} parts, i.e., HW-oriented implementation to which the upper-layer driver writer is oblivious.
252*14b24e2bSVaishali KulkarniAbove that is a concise API layer, through which the upper-layer driver should manipulate the ecore code.
253*14b24e2bSVaishali Kulkarni
254*14b24e2bSVaishali Kulkarni\section{Ecore API}
255*14b24e2bSVaishali Kulkarni\label{sec:overview-api}
256*14b24e2bSVaishali KulkarniThe Ecore API contains two types of files:
257*14b24e2bSVaishali Kulkarni\begin{enumerate}
258*14b24e2bSVaishali Kulkarni	\item Files of the format \texttt{ecore\_<module>\_api.h} -- these files are the SW API between the ecore and the upper-layer driver:
259*14b24e2bSVaishali Kulkarni	\begin{enumerate}
260*14b24e2bSVaishali Kulkarni		\item \texttt{ecore\_cxt\_api.h}.
261*14b24e2bSVaishali Kulkarni		\item \texttt{ecore\_dev\_api.h}.
262*14b24e2bSVaishali Kulkarni		\item \texttt{ecore\_fcoe\_api.h}.
263*14b24e2bSVaishali Kulkarni		\item \texttt{ecore\_int\_api.h}.
264*14b24e2bSVaishali Kulkarni		\item \texttt{ecore\_iov\_api.h}.
265*14b24e2bSVaishali Kulkarni		\item \texttt{ecore\_iscsi\_api.h}.
266*14b24e2bSVaishali Kulkarni		\item \texttt{ecore\_ll2\_api.h}.
267*14b24e2bSVaishali Kulkarni		\item \texttt{ecore\_roce\_api.h}.
268*14b24e2bSVaishali Kulkarni		\item \texttt{ecore\_sp\_api.h}.
269*14b24e2bSVaishali Kulkarni		\item \texttt{ecore\_vf\_api.h}.
270*14b24e2bSVaishali Kulkarni		\item \texttt{ecore\_mcp\_api.h}.
271*14b24e2bSVaishali Kulkarni	\end{enumerate}
	\item Files of the format \texttt{ecore\_hsi\_<protocol>.h} -- these files contain the API between FW/HW and the ecore/upper-layer driver:
273*14b24e2bSVaishali Kulkarni	\begin{enumerate}
274*14b24e2bSVaishali Kulkarni		\item \texttt{ecore\_hsi\_common.h}.
275*14b24e2bSVaishali Kulkarni		\item \texttt{ecore\_hsi\_eth.h}.
276*14b24e2bSVaishali Kulkarni		\item \texttt{ecore\_hsi\_fcoe.h}.
277*14b24e2bSVaishali Kulkarni		\item \texttt{ecore\_hsi\_iscsi.h}.
278*14b24e2bSVaishali Kulkarni		\item \texttt{ecore\_hsi\_roce.h}.
279*14b24e2bSVaishali Kulkarni		\item \texttt{ecore\_hsi\_tcp.h}.
280*14b24e2bSVaishali Kulkarni		\item \texttt{ecore\_hsi\_toe.h}.
281*14b24e2bSVaishali Kulkarni	\end{enumerate}
282*14b24e2bSVaishali Kulkarni\end{enumerate}
283*14b24e2bSVaishali KulkarniUpper-layer driver should not include any other ecore header file, as the rest of the header files are internal, with the following exceptions:
284*14b24e2bSVaishali Kulkarni\begin{itemize}
285*14b24e2bSVaishali Kulkarni	\item \texttt{ecore\_chain.h} -- Networking drivers will probably want to include this to benefit from the already-implemented chain.
286*14b24e2bSVaishali Kulkarni	\item \texttt{ecore\_utils.h} -- Useful macros which can be used by upper-layer driver.
287*14b24e2bSVaishali Kulkarni	\item \texttt{ecore\_status.h} -- contains \texttt{enum \_ecore\_status\_t}. Many of the ecore return values are of this type.
288*14b24e2bSVaishali Kulkarni\end{itemize}
289*14b24e2bSVaishali Kulkarni
290*14b24e2bSVaishali Kulkarni\begin{warning}
291*14b24e2bSVaishali KulkarniCurrently \texttt{ecore.h, ecore\_proto\_if.h} should also be included by upper-layer driver; This will (hopefully) be fixed shortly.
292*14b24e2bSVaishali Kulkarni\end{warning}
293*14b24e2bSVaishali Kulkarni
294*14b24e2bSVaishali Kulkarni
295*14b24e2bSVaishali Kulkarni\section{Ecore Internal files}
296*14b24e2bSVaishali KulkarniThis lists the ecore files, giving each a short description:
297*14b24e2bSVaishali Kulkarni
298*14b24e2bSVaishali Kulkarni\begin{itemize}
299*14b24e2bSVaishali Kulkarni	\item \texttt{ecore\_attn\_values.h}
300*14b24e2bSVaishali Kulkarni
	\item \texttt{ecore\_chain.h} -- Implements a cyclic chain; used for various interfaces with the FW [Buffer-Descriptors, Event Queues, etc.].
302*14b24e2bSVaishali Kulkarni
303*14b24e2bSVaishali Kulkarni	\item \texttt{ecore\_cxt\_api.[ch]} -- Handles the allocation, configuration and distribution of contexts to the various clients.
304*14b24e2bSVaishali Kulkarni
	\item \texttt{ecore\_dbg\_fw\_funcs.[ch], ecore\_dbg\_values.h, ecore\_fw\_defs} -- Files which contain code related to various debug features ecore can provide [e.g., grcDump].
306*14b24e2bSVaishali Kulkarni
307*14b24e2bSVaishali Kulkarni	\item \texttt{ecore\_fcoe.[ch], ecore\_iscsi.[ch], ecore\_ll2.[ch], ecore\_roce.[ch]} -- files containing specific ecore code for the storage protocols.
308*14b24e2bSVaishali Kulkarni
309*14b24e2bSVaishali Kulkarni	\item \texttt{ecore\_dev.[ch]} -- Contains much of the functionality of starting/stopping the hardware. See chapter \ref{cha:hwinit}.
310*14b24e2bSVaishali Kulkarni
311*14b24e2bSVaishali Kulkarni	\item \texttt{ecore\_hw.[ch], ecore\_gtt\_reg\_addr.h, ecore\_gtt\_values.h} -- contains the functionality for register access and DMAE. See chapter \ref{cha:reg}.
312*14b24e2bSVaishali Kulkarni
	\item \texttt{ecore.h} -- contains the definition of the most \textit{elementary} structures in the ecore, the \texttt{ecore\_dev} and the \texttt{ecore\_hwfn}.
314*14b24e2bSVaishali Kulkarni
315*14b24e2bSVaishali Kulkarni	\item \texttt{ecore\_init\_defs.h, ecore\_init\_fw\_funcs.[ch], ecore\_init\_ops.[ch], \\ ecore\_init\_values.h, ecore\_rt\_defs} -- Code responsible for initialization and configuration of the HW and loading of the FW, mostly in relation with the init-tool. See chapter \ref{cha:hwinit}.
316*14b24e2bSVaishali Kulkarni	\begin{REMINDER}
317*14b24e2bSVaishali Kulkarni			Chapter \ref{cha:hwinit} doesn't really give a thorough explanation of the init tool - at most it mentions it. Do we want a section/chapter of it somewhere?
318*14b24e2bSVaishali Kulkarni	\end{REMINDER}
319*14b24e2bSVaishali Kulkarni
320*14b24e2bSVaishali Kulkarni	\item \texttt{ecore\_int.[ch]} -- Handles interrupts and attentions. See chapter \ref{cha:int}.
321*14b24e2bSVaishali Kulkarni
322*14b24e2bSVaishali Kulkarni	\item \texttt{ecore\_iro.h, ecore\_iro\_values.h} -- Generated FW files. Enables ecore to access [or supply to upper-layer] addresses inside the \texttt{storm}'s RAM.
323*14b24e2bSVaishali Kulkarni
324*14b24e2bSVaishali Kulkarni	\item \texttt{ecore\_mcp.[ch]} -- Contains the interface between the ecore and the MFW. See chapter \ref{cha:mfw}.
325*14b24e2bSVaishali Kulkarni
	\item \texttt{ecore\_sp\_commands.[ch], ecore\_spq.[ch]} -- Contains the slowpath logic required for sending ramrods and configuring \& handling the various slowpath events.
327*14b24e2bSVaishali Kulkarni
328*14b24e2bSVaishali Kulkarni	\item \texttt{ecore\_sriov.[ch], ecore\_vf.[ch], ecore\_vfpf\_if.h} -- Contains the SRIOV implementation both from the PF and VF sides.
329*14b24e2bSVaishali Kulkarni\end{itemize}
330*14b24e2bSVaishali Kulkarni
331*14b24e2bSVaishali Kulkarni\section{OS abstraction Layer}
332*14b24e2bSVaishali Kulkarni\label{sec:osal}
333*14b24e2bSVaishali Kulkarni
334*14b24e2bSVaishali Kulkarni%\section{Driver Core}
335*14b24e2bSVaishali Kulkarni%As the ecore contains most of the lowlevel code operating the non-fastpath parts of the working with the HW and FW, it can be thought of as some sort of library – it contains bits of code meant to be operated from an outside source. Each OS needs to implement its own driver, calling the various functions in the ecore API in a place fitting for that OS driver flows.
336*14b24e2bSVaishali Kulkarni%Each OS will independently need to create a driver that incorporates the ecore, both filling the OS dependent callbacks required by the ecore to perform and supply an upper level of abstraction which best suits that OS. Notice this upper layer is sometimes also, mistakenly, referred to as ecore [e.g., bnx2c for linux drivers] but there’s an important distinction:
337*14b24e2bSVaishali Kulkarni%\begin{itemize}
338*14b24e2bSVaishali Kulkarni%	\item Ecore – shared code between ALL operating systems.
339*14b24e2bSVaishali Kulkarni%	\item Upper-Layer – shared code by all drivers on a single operating system.
340*14b24e2bSVaishali Kulkarni%\end{itemize}
341*14b24e2bSVaishali Kulkarni
342*14b24e2bSVaishali Kulkarni%It’s possible [and likely] that an operating system will break the various protocols into different sub-drivers, where each sub-driver will be designated for a specific protocol. Notice that if such separation is made, the preferred implementation is that the OS will implement a ‘core’ driver consisting of the Ecore and an upper-layer, and define an API through which the various protocol drivers communicate with the OS core driver\footnote{Although notice there should be no inter-dependencies between HW-functions in the ecore, so the alternative method where each contains the ecore is also feasible}.
343*14b24e2bSVaishali Kulkarni
344*14b24e2bSVaishali KulkarniThe ecore utilizes various functions which should be implemented by the upper layer. There are two main ‘types’ of functions:
345*14b24e2bSVaishali Kulkarni\begin{enumerate}
	\item Basic OS-specific operations that the ecore needs in order to perform its work; e.g., memory allocations – the ecore needs to allocate memory for various reasons, and it needs the upper layer to supply the method by which it can do so.
347*14b24e2bSVaishali Kulkarni	\item Hooks by which the upper-layer can run additional OS specific code, or make decisions affecting the work of the ecore. E.g., in the SRIOV flows, the mechanism for passing messages from VF to PF is implemented in the ecore but the decision whether a request is valid or not might be OS specific – as in the case of unicast filters.
348*14b24e2bSVaishali Kulkarni\end{enumerate}
349*14b24e2bSVaishali Kulkarni
350*14b24e2bSVaishali KulkarniThe various functions that need to be implemented by the upper-layer can be found in Appendix \ref{app:osal} -- OSAL Documentation.
351*14b24e2bSVaishali Kulkarni
352*14b24e2bSVaishali Kulkarni
353*14b24e2bSVaishali Kulkarni\section{Ecore print scheme}
354*14b24e2bSVaishali KulkarniThe ecore utilizes several printing methods to print messages to the system logs; It requires some functions to be implemented by the upper-layer for this to work – the required documentation can be found in Appendix \ref{app:osal} -- OSAL Documentation.
In order to support this, the verbosity mechanism contains two distinct values \texttt{\myindex{DP\_LEVEL}} and \texttt{\myindex{DP\_MODULE}} [both can be found in \texttt{ecore.h}]. Since the printing scheme in the ecore was defined with the linux limitations in mind – that is, the API [via ethtool] allowing the setting of the debug message level is only 32 bits long – both \texttt{DP\_MODULE} and \texttt{DP\_LEVEL} together occupy only 32 bits.
The \texttt{DP\_LEVEL} determines which prints will actually reach the logs based on the message urgency, defining 4 levels – verbose, info, notice and error. When a level is set, all prints which are at least as urgent will be printed. Notice this means there’s a single level – e.g., you can’t have a configuration in which you’ll get all the `info' level prints, but not the `notice' level.
357*14b24e2bSVaishali KulkarniThe \texttt{DP\_MODULE} is relevant only when level is set to verbose, and it defines which of the verbose prints should reach system logs, based mostly on component/flow. When setting the module level, a bit mask of the requested components/flows is set.
358*14b24e2bSVaishali KulkarniIn order to set which prints should reach system logs, the upper layer should utilize the ecore function \myfunc{init\_dp}{init_dp} defined in \texttt{ecore\_dev.c}.
359*14b24e2bSVaishali Kulkarni
360*14b24e2bSVaishali Kulkarni\section{Compilation flags}
361*14b24e2bSVaishali KulkarniThe ecore project contains several optional compilation flags that if passed would affect the content compiled. A few notable flags:
362*14b24e2bSVaishali Kulkarni\begin{itemize}
	\item ASIC\_ONLY -- By default, this is `off'. Setting this would remove content that is relevant only for simulations of the hardware, i.e., emulations and FPGAs.

	\item REAL\_ASIC\_ONLY -- By default, this is `off'. Setting this would remove content that is relevant for non-productized hardware, e.g., workarounds for BigBear A0.

	\item REMOVE\_DBG -- By default, this is `off'. There are several structures and fields in ecore which aren't functional; their sole purpose is to store interesting data for memory dumps in case of failures. Setting this would remove all such data items.
368*14b24e2bSVaishali Kulkarni\end{itemize}
369*14b24e2bSVaishali Kulkarni
370*14b24e2bSVaishali Kulkarni\SpillChapterFuncs
371*14b24e2bSVaishali Kulkarni
372*14b24e2bSVaishali Kulkarni
373*14b24e2bSVaishali Kulkarni\chapterimage{qlogic-full-36}
374*14b24e2bSVaishali Kulkarni\chapter{Register Access}
375*14b24e2bSVaishali Kulkarni\label{cha:reg}
This chapter describes the ecore API for accessing registers.
The E4 BAR is a reduced BAR, i.e., it does not map the entire register address range.
378*14b24e2bSVaishali KulkarniTo access the entire range, windows are defined that can be configured to point to a certain address within the device and allow reading and writing of registers / memory from that address.
379*14b24e2bSVaishali KulkarniThere are two types of windows, \textbf{PTT} (per PF Translation Table) and \textbf{GTT} (Global Translation Table).
380*14b24e2bSVaishali Kulkarni
381*14b24e2bSVaishali KulkarniThe \textit{external BAR} is the BAR accessed by the ecore. It is divided into configurable windows which point to different areas within the device (Image \ref{fig:bars}, Internal BAR vs. External BAR, demonstrates this).
382*14b24e2bSVaishali Kulkarni
383*14b24e2bSVaishali Kulkarni\begin{figure}[ht]
384*14b24e2bSVaishali Kulkarni	\caption{Internal BAR vs. External BAR}
385*14b24e2bSVaishali Kulkarni	\centering
386*14b24e2bSVaishali Kulkarni	\includegraphics[width=0.8\paperwidth]{reg_access}
387*14b24e2bSVaishali Kulkarni	\label{fig:bars}
388*14b24e2bSVaishali Kulkarni\end{figure}
389*14b24e2bSVaishali Kulkarni
390*14b24e2bSVaishali KulkarniFor more details on the E4 BAR access scheme the reader is referred to the “Reduced PF BAR0 size” section of \cite{doc:PXP}. \\
391*14b24e2bSVaishali Kulkarni
392*14b24e2bSVaishali Kulkarni
393*14b24e2bSVaishali KulkarniAll register access should be done within the ecore layer and it is not expected for the upper layers to access registers at all.
394*14b24e2bSVaishali KulkarniFor this reason, there is no description here on how to find the register address and how to distinguish whether the address is mapped into a \myindex{GTT} or a \myindex{PTT}.
395*14b24e2bSVaishali KulkarniHowever, in case the need does arise in the future, the API for reading/writing is detailed below as well.
396*14b24e2bSVaishali Kulkarni
397*14b24e2bSVaishali KulkarniEcore requires an OSAL implementation of the macros:
398*14b24e2bSVaishali Kulkarni\begin{enumerate}
399*14b24e2bSVaishali Kulkarni	\item \myindex{REG\_RD}
400*14b24e2bSVaishali Kulkarni	\item \myindex{REG\_WR}
401*14b24e2bSVaishali Kulkarni\end{enumerate}
402*14b24e2bSVaishali KulkarniThese macros are a direct read / write from the BAR with the absolute address offset given.
403*14b24e2bSVaishali KulkarniImplementation should add the offset to the mapped BAR address and call the appropriate OS specific API.
404*14b24e2bSVaishali Kulkarni
405*14b24e2bSVaishali KulkarniSeveral ecore interface functions require a PTT. There is a pool of PTTs maintained by ecore.
406*14b24e2bSVaishali KulkarniThe reason there are several PTTs is to enable simultaneous access to device registers from different flows.
407*14b24e2bSVaishali KulkarniThe PTT is reserved per flow, and it is the responsibility of the upper layer to make sure it does not use the same PTT in flows that can run concurrently. The upper layer requests a PTT entry using \myfunc{ptt\_acquire}{ptt_acquire}.
408*14b24e2bSVaishali KulkarniHowever, to avoid running out of this resource, it is also the responsibility of the upper layer not to acquire too many PTTs without releasing them. Returning a PTT entry back to the pool is done via \myfunc{ptt\_release}{ptt_release}.
409*14b24e2bSVaishali Kulkarni
410*14b24e2bSVaishali KulkarniUsing a PTT, ecore [and upper-driver] can access registers/memories using inner BAR addresses; The ecore is responsible for configuring the memory windows, and translates the inner address into an external address [i.e., one which resides on the actual BAR as seen by the host]. The register access is then made by calling \texttt{ecore\_wr} and \texttt{ecore\_rd}.
411*14b24e2bSVaishali Kulkarni\SpillChapterFuncs
412*14b24e2bSVaishali Kulkarni
413*14b24e2bSVaishali Kulkarni
414*14b24e2bSVaishali Kulkarni\chapterimage{qlogic-full-36}
415*14b24e2bSVaishali Kulkarni\chapter{Hardware/Firmware initialization}
416*14b24e2bSVaishali Kulkarni\label{cha:hwinit}
417*14b24e2bSVaishali Kulkarni
418*14b24e2bSVaishali Kulkarni\section{Basic concepts -- inner-working of the ecore}
419*14b24e2bSVaishali Kulkarni\begin{itemize}
420*14b24e2bSVaishali Kulkarni	\item \myindex{ILT} – one of the features of our device is that the memories used by various HW blocks are allocated on the host memory [as opposed to a large embedded memory segment on chip]. The driver is responsible for allocating the memory needed for those HW blocks [DMA-coherent memory] and configuring both the HW blocks themselves and a communal sub-block known as ILT. The ecore contains complicated code that decides exactly how much memory each such block needs, allocates it in an ‘ilt\_shadow’, and then uses that shadow to configure the ILT itself with all the allocated chunks of memory.
421*14b24e2bSVaishali Kulkarni
422*14b24e2bSVaishali KulkarniAdditional ILT documentation is at \cite{doc:ILT}.
423*14b24e2bSVaishali Kulkarni
424*14b24e2bSVaishali Kulkarni	\item \myindex{RT array} – when the ecore initializes the HW, it utilizes a common, tool-generated code known as the init-tool. Since there are quite a few values which depend upon actual setup configuration and thus must receive feedback during the initialization from the ecore, instead of adding many such hooks there’s the concept of the RunTime array – an array of values filled by the ecore prior to the init-tool run based on the complex ecore logic. The init-tool will then utilize the values in that array to configure the HW according to the correct order of configuration [i.e., writing the values set by ecore in the array in the correct place in the initialization flow where they’re required/the block that contains them is configured].
425*14b24e2bSVaishali Kulkarni\end{itemize}
426*14b24e2bSVaishali Kulkarni
427*14b24e2bSVaishali Kulkarni\section{Initialization}
428*14b24e2bSVaishali Kulkarni\label{sec:init-init}
429*14b24e2bSVaishali KulkarniThe functions required for initializing the HW/FW mostly reside in \texttt{ecore\_dev.[ch]}; More accurately, most of the outside API [toward the upper-layer] is in \texttt{ecore\_dev.h} – the functions themselves utilize many other ecore files.
430*14b24e2bSVaishali KulkarniThis section gives a brief description of the functions that need to be called, what they do, requirements, etc., in order to successfully initialize the ecore structs and load the HW/FW.
431*14b24e2bSVaishali Kulkarni
432*14b24e2bSVaishali Kulkarni\silentfunc{init_struct}
433*14b24e2bSVaishali Kulkarni\silentfunc{hw_prepare}
434*14b24e2bSVaishali Kulkarni\silentfunc{resc_alloc}
435*14b24e2bSVaishali Kulkarni\silentfunc{resc_setup}
436*14b24e2bSVaishali Kulkarni\silentfunc{hw_init}
437*14b24e2bSVaishali Kulkarni\begin{itemize}
438*14b24e2bSVaishali Kulkarni	\item \myfunc{init\_struct}{init_struct} – After allocating and zeroing the ecore\_dev [the upper-layer responsibility], a pointer to it should be passed to this function for some early initialization of the data structure. \\
439*14b24e2bSVaishali Kulkarni
440*14b24e2bSVaishali Kulkarni	\item \myfunc{hw\_prepare}{hw_prepare} – This function serves two purposes [plus some additional inner ecore workings]:
441*14b24e2bSVaishali Kulkarni	\begin{enumerate}
442*14b24e2bSVaishali Kulkarni		\item It enables the ecore to access its BAR, doing things such as enabling the PTT pool and opening the access in the PGLUE\_B block.
443*14b24e2bSVaishali Kulkarni		Notice this doesn’t actually do anything to the PCI BAR itself – the upper-layer should have initialized those before calling this function, and must guarantee that its REG\_WR/RD functions actually point to valid, accessible addresses.
444*14b24e2bSVaishali Kulkarni		\item It learns as much as it can about system configuration from HW and SHMEM.
445*14b24e2bSVaishali Kulkarni	\end{enumerate}
446*14b24e2bSVaishali Kulkarni
447*14b24e2bSVaishali KulkarniTrying to access registers except for pci-related ones prior to calling this function will fail. \\
448*14b24e2bSVaishali Kulkarni
449*14b24e2bSVaishali Kulkarni	\item \myfunc{resc\_alloc}{resc_alloc} – Allocates the various ecore-related memory, e.g., contexts, slowpath queue, SRIOV information, etc. Notice that before calling this function, each HW-function of the \texttt{ecore\_dev} should have its `pf\_params’ set, as the function depends upon the protocol-specific resources for its calculations. \\
450*14b24e2bSVaishali Kulkarni
451*14b24e2bSVaishali Kulkarni	\item \myfunc{resc\_setup}{resc_setup} – Configures the various slowpath elements. Notice that since there’s no guarantee chip is alive at this point [i.e., it’s very likely the chip is reset at this point], it fills the configuration in the runtime array instead of actually writing it to chip. \\
452*14b24e2bSVaishali Kulkarni
453*14b24e2bSVaishali Kulkarni	\item \myfunc{hw\_init}{hw_init} – This function actually initializes the chip, using the init-tool and the runtime array to make the correct configuration.
454*14b24e2bSVaishali Kulkarni	 As part of the slowpath interrupt enablement, ecore invokes the OSAL\_SLOWPATH\_IRQ\_REQ() callback for each HW function. The client implementation should set up the IRQ handlers for slowpath interrupt handling.
455*14b24e2bSVaishali Kulkarni	 This is required since as part of the flow the \texttt{function\_start} ramrod will be sent to FW; Once FW finishes handling it, an \myindex{EQE} [Event Queue Element] will be placed in the slowpath event queue and an interrupt will be fired. The flow is dependent on the EQE being processed.
456*14b24e2bSVaishali Kulkarni
457*14b24e2bSVaishali Kulkarni	Some interesting sub-functions of the \texttt{ecore\_hw\_init()} method, at least for debugging purposes as many possible errors can be caught there:
458*14b24e2bSVaishali Kulkarni	\begin{itemize}
459*14b24e2bSVaishali Kulkarni		\item \texttt{ecore\_get\_init\_mode()} – this creates a bitmask which will be later passed to the init-tool which describes the configured mode – Multi function vs. Single function, 40G vs. 100G etc. A wrong configuration here could explain many peculiar events later on. \\
460*14b24e2bSVaishali Kulkarni
461*14b24e2bSVaishali Kulkarni%		\item ecore\_mcp\_load\_req() – the MFW [assuming it is present] will answer with one of 3 possible answers: ENGINE, PORT or FUNCTION.
462*14b24e2bSVaishali Kulkarni%		The MFW is responsible for initializing the common blocks [i.e., the HW blocks shared between the 2 engines], but the driver is responsible for the rest.
463*14b24e2bSVaishali Kulkarni%		Each function needs to perform different initialization based on whether it’s the first to load on its engine [ENGINE], the first to load on its port [PORT] or if it’s being loaded on an already initialized port [FUNCTION]\footnote{Initialization which is common for both engines will be performed by the MFW.}.
464*14b24e2bSVaishali Kulkarni%	Some very basic errors can be detected here, if the function receives an unexpected answer from MFW.
465*14b24e2bSVaishali Kulkarni	\end{itemize}
466*14b24e2bSVaishali Kulkarni
467*14b24e2bSVaishali KulkarniOnce this function returns, the chip is initialized, FW is functional and slowpath event queues are operational.
468*14b24e2bSVaishali Kulkarni
469*14b24e2bSVaishali Kulkarni\end{itemize}
470*14b24e2bSVaishali Kulkarni
471*14b24e2bSVaishali Kulkarni\section{Zipped and Binary firmware}
472*14b24e2bSVaishali Kulkarni\label{sec:init-Zipped and Binary firmware}
473*14b24e2bSVaishali Kulkarni\begin{itemize}
474*14b24e2bSVaishali Kulkarni	\item \myindex{Zipped Firmware} - There are two types of firmware files generated in ecore.\\
475*14b24e2bSVaishali KulkarniNon-zipped firmware [ecore\_init\_values.h and ecore\_init\_values.bin] and Zipped firmware [ecore\_init\_values\_zipped.h and
476*14b24e2bSVaishali Kulkarniecore\_init\_values\_zipped.bin] files. Each type of file is generated in two formats, i.e., a C header file and a binary file,
477*14b24e2bSVaishali Kulkarniwhere each has all relevant data needed to initialize the firmware. Either of these file types can be used for firmware initialization.
478*14b24e2bSVaishali KulkarniThe difference is that the Zipped firmware files have a lot of the dmae firmware data zipped, which is beneficial in reducing the code size.\\
479*14b24e2bSVaishali Kulkarni
480*14b24e2bSVaishali KulkarniBy default, the non-zipped variant is used. If ecore clients want to use zipped version of firmware then they need to have
481*14b24e2bSVaishali KulkarniCONFIG\_ECORE\_ZIPPED\_FW defined/enabled by their operating system drivers to make the feature operational. For unzipping the
482*14b24e2bSVaishali Kulkarnizipped firmware data ecore clients need to implement OSAL\_UNZIP\_DATA() as well. This OSAL is meant for unzipping the
483*14b24e2bSVaishali Kulkarnizipped firmware data in order to do firmware initialization.\\
484*14b24e2bSVaishali Kulkarni
485*14b24e2bSVaishali Kulkarni	\item \myindex{Binary Firmware} - As explained above there are two formats of firmware files
486*14b24e2bSVaishali Kulkarnigenerated by ecore, C header files [ecore\_init\_values.h and ecore\_init\_values\_zipped.h] and
487*14b24e2bSVaishali Kulkarnibinary firmware files [ecore\_init\_values.bin and ecore\_init\_values\_zipped.bin]. Either of those file formats
488*14b24e2bSVaishali Kulkarnican be used by ecore clients to utilize firmware data. By default, ecore uses the .h files which are compiled as part of the ecore,
489*14b24e2bSVaishali Kulkarnibut using binary firmware files has the advantage where the code size is reduced and the FW can be loaded from a file imported by
490*14b24e2bSVaishali Kulkarnithe system.\\
491*14b24e2bSVaishali Kulkarni
492*14b24e2bSVaishali KulkarniIf ecore clients want to use firmware data from binary files then they need to have CONFIG\_ECORE\_BINARY\_FW defined/enabled by their
493*14b24e2bSVaishali Kulkarnioperating system drivers to make the feature operational. Ecore clients must store all binary firmware data from the
494*14b24e2bSVaishali Kulkarnifile into a buffer referenced by a void* pointer and pass that firmware data buffer pointer to ecore\_hw\_init() as an argument.
495*14b24e2bSVaishali KulkarniIf an ecore client is not using a binary firmware file, i.e., it is using the firmware from the regular header files, then it
496*14b24e2bSVaishali Kulkarnishould pass NULL as the argument for the binary firmware data buffer in ecore\_hw\_init().
497*14b24e2bSVaishali Kulkarni
498*14b24e2bSVaishali Kulkarni
499*14b24e2bSVaishali Kulkarni\end{itemize}
500*14b24e2bSVaishali Kulkarni
501*14b24e2bSVaishali Kulkarni\section{De-Initialization}
502*14b24e2bSVaishali Kulkarni\label{sec:init-de-init}
503*14b24e2bSVaishali Kulkarni\silentfunc{hw_stop}
504*14b24e2bSVaishali Kulkarni\silentfunc{resc_free}
505*14b24e2bSVaishali Kulkarni\silentfunc{hw_remove}
506*14b24e2bSVaishali Kulkarni\begin{itemize}
507*14b24e2bSVaishali Kulkarni	\item \myfunc{hw\_stop}{hw_stop} – this function notifies the MFW that the HW-functions unload, stops the FW/HW for all HW-functions in the \texttt{ecore\_dev} including sending the common PF\_STOP ramrod for each HW-function, and disables the HW-functions in various HW blocks.
508*14b24e2bSVaishali Kulkarni	Notice that before calling this, all the protocol specifics done after initializing the HW should have already been reversed by the upper-layer [e.g., L2 VPORTs which were started by the upper layer should be stopped before calling this].
509*14b24e2bSVaishali Kulkarni	Following this function, it is guaranteed HW will not generate any more slowpath interrupts, so the interrupt handler can be released [and slowpath DPC context can be stopped]. \\
510*14b24e2bSVaishali Kulkarni
511*14b24e2bSVaishali Kulkarni	\item \myfunc{int\_disable\_post\_isr\_release}{ecore_int_disable_post_isr_release} – this function performs the required IRQ-related cleanup after the ISR release. The function needs to be called after releasing all slowpath IRQs of the device.
512*14b24e2bSVaishali Kulkarni
513*14b24e2bSVaishali Kulkarni	\item \myfunc{resc\_free}{resc_free} – Releases the memory allocated by the ecore during \texttt{ecore\_resc\_alloc()}. \\
514*14b24e2bSVaishali Kulkarni
515*14b24e2bSVaishali Kulkarni	\item \myfunc{hw\_remove}{hw_remove} – Releases the memory allocated early by the ecore during \texttt{ecore\_hw\_prepare()}.
516*14b24e2bSVaishali Kulkarni	Following this, REG\_RD/REG\_WR are no longer operational - upper layer can disable the PCI BAR.
517*14b24e2bSVaishali Kulkarni\end{itemize}
518*14b24e2bSVaishali Kulkarni\SpillChapterFuncs
519*14b24e2bSVaishali Kulkarni
520*14b24e2bSVaishali Kulkarni%\chapterimage{qlogic-full-36}
521*14b24e2bSVaishali Kulkarni%\chapter{Firmware hsi}
522*14b24e2bSVaishali Kulkarni%\begin{NOTICE}
523*14b24e2bSVaishali Kulkarni%Placeholder - owner Michal
524*14b24e2bSVaishali Kulkarni%\end{NOTICE}
525*14b24e2bSVaishali Kulkarni
526*14b24e2bSVaishali Kulkarni
527*14b24e2bSVaishali Kulkarni\chapterimage{qlogic-full-36}
528*14b24e2bSVaishali Kulkarni\chapter{Interrupts}
529*14b24e2bSVaishali Kulkarni\label{cha:int}
530*14b24e2bSVaishali KulkarniThis chapter describes how the device notifies the driver about operations -
531*14b24e2bSVaishali Kulkarniit describes how firmware status is reflected on host memory via status blocks, and how the firmware initiates an interrupt toward the driver.
532*14b24e2bSVaishali Kulkarni
533*14b24e2bSVaishali KulkarniA reference document that fully describes status blocks can be found at \cite{doc:SB}.
534*14b24e2bSVaishali Kulkarni
535*14b24e2bSVaishali Kulkarni
536*14b24e2bSVaishali Kulkarni\section{Status blocks - host point of view}
537*14b24e2bSVaishali KulkarniThe \myindex{status block} structures are allocated on host memory. The status block is an array of indices which are updated by firmware (mainly ring consumer values).
538*14b24e2bSVaishali KulkarniThere are 288 status blocks per path in Big Bear and 368 in K2.
539*14b24e2bSVaishali Kulkarni
540*14b24e2bSVaishali KulkarniWhen one of the indices on a status block is updated (because some event occurred at the device), the status block is copied from internal device memory to host memory, and an interrupt is generated.
541*14b24e2bSVaishali KulkarniThe CAU unit may aggregate several events and generate a single update of the status block and a single interrupt, in order to lower the number of interrupts sent to host CPU.
542*14b24e2bSVaishali Kulkarni
543*14b24e2bSVaishali KulkarniThe indices of the status blocks are referred to as \myindex{protocol indices} (abbreviated to \textit{pi}).
544*14b24e2bSVaishali KulkarniOriginally, the motivation behind multiple status blocks was to enable multiple protocols to work with the same status block, giving each protocol a different index.
545*14b24e2bSVaishali KulkarniHowever, with single personality this is no longer the case.
546*14b24e2bSVaishali KulkarniMultiple indices are used for L2 to differentiate between RX / TX and different class of service operations.
547*14b24e2bSVaishali Kulkarni
548*14b24e2bSVaishali Kulkarni\subsection{Initialization}
549*14b24e2bSVaishali Kulkarni\label{ssec:sb-init}
550*14b24e2bSVaishali KulkarniThere is a dedicated status block for ecore usage which is allocated and maintained by ecore.
551*14b24e2bSVaishali KulkarniThe fastpath status blocks used for traffic need to be allocated by the protocol driver.
552*14b24e2bSVaishali KulkarniThis memory must be DMA-coherent memory.
553*14b24e2bSVaishali KulkarniThe ecore defines a structure called \texttt{ecore\_sb\_info} which should be allocated by the protocol driver and initialized using the function \myfunc{int\_sb\_init}{int_sb_init}.
554*14b24e2bSVaishali Kulkarni%[code snippet \ref{snippets/ecore_int_sb_init.h}].
555*14b24e2bSVaishali KulkarniThis structure is later used for calling the functions \texttt{ecore\_sb\_update\_sb\_idx()} and \texttt{ecore\_sb\_ack()}.
556*14b24e2bSVaishali Kulkarni
557*14b24e2bSVaishali Kulkarni%\insertcode{snippets/ecore_int_sb_init.h}{Initialize status blocks}
558*14b24e2bSVaishali Kulkarni
559*14b24e2bSVaishali Kulkarni\begin{NOTICE}
560*14b24e2bSVaishali Kulkarni	Status blocks need to be allocated and initialized before queues are created.
561*14b24e2bSVaishali Kulkarni\end{NOTICE}
562*14b24e2bSVaishali Kulkarni
563*14b24e2bSVaishali Kulkarni\section{Mode and configuration}
564*14b24e2bSVaishali KulkarniThe device can work in one of the following interrupt modes:
565*14b24e2bSVaishali Kulkarni\begin{enumerate}
566*14b24e2bSVaishali Kulkarni	\item INTA – Physical interrupt line.
567*14b24e2bSVaishali Kulkarni	\item MSI –  Message signaled interrupts. Device is programmed with one address to write to, and 16-bit data to identify the interrupt.
568*14b24e2bSVaishali Kulkarni	\item MSIX – Large number of interrupts (up to 2048) and each one gets a separate target address, making it possible to designate different interrupts to different processors.
569*14b24e2bSVaishali Kulkarni	This is the preferred mode for performance.
570*14b24e2bSVaishali Kulkarni	\item POLL – HW increments producers on status blocks in case of interrupts but it doesn't generate any message nor does it assert any physical line. It's the upper-layer responsibility to periodically poll on those changes to identify interrupts. \\
571*14b24e2bSVaishali Kulkarni\end{enumerate}
572*14b24e2bSVaishali Kulkarni
573*14b24e2bSVaishali KulkarniEnabling and disabling interrupts is OS specific and done differently by the OS specific layer of the driver.
574*14b24e2bSVaishali KulkarniHowever, the device needs to be configured differently according to the selected interrupt mode; This initialization is done by the ecore.
575*14b24e2bSVaishali Kulkarni
576*14b24e2bSVaishali KulkarniIn order to do so, the proper interrupt mode using an \myindex{ecore\_int\_mode} enum [can be seen in code snippet [\ref{snippets/ecore_int_mode.h}]] needs to be passed when calling \texttt{ecore\_hw\_init}.
577*14b24e2bSVaishali Kulkarni
578*14b24e2bSVaishali Kulkarni\insertcode{snippets/ecore_int_mode.h}{Enum for the interrupt mode}
579*14b24e2bSVaishali Kulkarni
580*14b24e2bSVaishali KulkarniIf the upper-layer driver later wishes to change the interrupt mode, it can do so by calling \myfunc{int\_igu\_enable\_int}{int_igu_enable_int},
581*14b24e2bSVaishali Kulkarnior \myfunc{int\_igu\_disable\_int}{int_igu_disable_int} when wishing to disable interrupt generation altogether.
582*14b24e2bSVaishali Kulkarni
583*14b24e2bSVaishali Kulkarni%\insertcode{snippets/ecore_int_endis.h}{Functions for enabling/disabling interrupts}
584*14b24e2bSVaishali Kulkarni
585*14b24e2bSVaishali KulkarniIn MSIX mode, each status block should generate its own interrupt message, meaning in reasonable OSes it should be possible to connect each interrupt with the specific handler of that interrupt's source.
586*14b24e2bSVaishali KulkarniThe \textit{sb\_id} passed as value to \textit{ecore\_int\_sb\_init()} will indicate the index of the vector in the MSI-X table that would be used to generate interrupts for this specific SB.
587*14b24e2bSVaishali KulkarniI.e., if the value passed is $X$, then the $X^{th}$ MSI-X vector will generate interrupts for this SB.
588*14b24e2bSVaishali Kulkarni
589*14b24e2bSVaishali KulkarniWhen working in INTA / MSI we work in single-ISR multiple-DPC mode; The same interrupt line can signify interrupts from many possible status blocks. In this case the information of which status block generated an interrupt needs to be read from a register in the IGU. Use \myfunc{int\_igu\_read\_sisr\_reg}{int_igu_read_sisr_reg} to get the information [returned value is a bitmask of status blocks which asserted the interrupt].
590*14b24e2bSVaishali Kulkarni
591*14b24e2bSVaishali Kulkarni%\insertcode{snippets/ecore_int_sisr.h}{INTA mechanism for reading interrupt source}
592*14b24e2bSVaishali Kulkarni
593*14b24e2bSVaishali Kulkarni\section{IGU block operation}
594*14b24e2bSVaishali KulkarniThe IGU block has a mapping of status blocks to interrupts.
595*14b24e2bSVaishali KulkarniThe mapping is done inside the IGU CAM and maps a (function, vector) pair to an MSI-X message.
596*14b24e2bSVaishali KulkarniIn case of INTA / MSI, each function has a register in the IGU stating which status block gave the interrupt.
597*14b24e2bSVaishali KulkarniThe IGU block is responsible for generating the interrupt. It receives the command to generate an interrupt from the CAU block.
598*14b24e2bSVaishali KulkarniThe IGU block maintains producer-consumer pairs per status block.
599*14b24e2bSVaishali KulkarniThe CAU updates the producer after it wrote the status block to host memory.
600*14b24e2bSVaishali KulkarniThe driver updates the consumer after it finished processing the status block.
601*14b24e2bSVaishali KulkarniThe IGU block generates an interrupt when there is a prod-cons difference on the status block.
602*14b24e2bSVaishali Kulkarni
603*14b24e2bSVaishali KulkarniCAU also handles coalescing of status block writes and interrupt generation.
604*14b24e2bSVaishali KulkarniThe CAU unit may aggregate several events and generate a single update of the status block and a single interrupt, in order to lower the number of interrupts sent to host CPU.
605*14b24e2bSVaishali Kulkarni
606*14b24e2bSVaishali Kulkarni\section{Interrupt handling flow}
607*14b24e2bSVaishali Kulkarni\label{sec:sb-flow}
608*14b24e2bSVaishali KulkarniThe flow of handling an interrupt in the device and driver is as follows:
609*14b24e2bSVaishali Kulkarni\silentfunc{sb_update_sb_idx}
610*14b24e2bSVaishali Kulkarni\silentfunc{sb_ack}
611*14b24e2bSVaishali Kulkarni\begin{enumerate}
612*14b24e2bSVaishali Kulkarni	\item The device (Firmware/CAU) updates a status block index.
613*14b24e2bSVaishali Kulkarni
614*14b24e2bSVaishali Kulkarni	\item The device copies the status block to host memory and generates an interrupt.
615*14b24e2bSVaishali Kulkarni
616*14b24e2bSVaishali Kulkarni	\item OS is triggered, calling the driver's Interrupt Service Routine [ISR].
617*14b24e2bSVaishali Kulkarni
618*14b24e2bSVaishali Kulkarni	\item (Possible upper-half handling and bottom-half scheduling, or other OS-specifics which are outside the scope of this document).
619*14b24e2bSVaishali Kulkarni
620*14b24e2bSVaishali Kulkarni	\item Driver identifies a producer update on the status block (as the producer is written as part of the status block on host memory) using \myfunc{sb\_update\_sb\_idx}{sb_update_sb_idx}.
621*14b24e2bSVaishali Kulkarni
622*14b24e2bSVaishali Kulkarni	\item Driver scans the protocol indices in the status block to determine the interrupt source.
623*14b24e2bSVaishali Kulkarni	\begin{NOTICE}
624*14b24e2bSVaishali Kulkarni		It's likely the upper-layer doesn't really need to scan the status block, but rather compare values in some previously-supplied addresses against a shadow copy. E.g., in L2 the ecore callbacks configuring the queues will return the addresses which the upper-layer should test for producer updates. See section [\ref{sec:l2-start}].
625*14b24e2bSVaishali Kulkarni	\end{NOTICE}
626*14b24e2bSVaishali Kulkarni
627*14b24e2bSVaishali Kulkarni	\item When Driver completes processing all the indices on the status block, it writes the producer value from the status block into the IGU consumer address, using \myfunc{sb\_ack}{sb_ack}.
628*14b24e2bSVaishali Kulkarni
629*14b24e2bSVaishali Kulkarni	\item The IGU compares the producer and consumer -- if they differ it will generate an additional interrupt.
630*14b24e2bSVaishali Kulkarni
631*14b24e2bSVaishali Kulkarni\end{enumerate}
632*14b24e2bSVaishali Kulkarni
633*14b24e2bSVaishali Kulkarni\begin{exampleT}
634*14b24e2bSVaishali Kulkarni	Assume an Rx packet is received by device. After FW places the packet in the Rx rings, it updates the status block of that Rx ring; This in turn is copied into host memory and an MSI-X interrupt for the appropriate Rx queue's status block is triggered.
635*14b24e2bSVaishali Kulkarni	Driver reads the status blocks, scanning the indices and identifies the interrupt is an Rx CQE consumer and handles the incoming packet. Assuming this is the only interrupt source [and there was also a single packet] driver then acks the status block.
636*14b24e2bSVaishali Kulkarni\end{exampleT}
637*14b24e2bSVaishali Kulkarni\SpillChapterFuncs
638*14b24e2bSVaishali Kulkarni
639*14b24e2bSVaishali Kulkarni
640*14b24e2bSVaishali Kulkarni\chapterimage{qlogic-full-36}
641*14b24e2bSVaishali Kulkarni\chapter{Management firmware [MFW] interface}
642*14b24e2bSVaishali Kulkarni\label{cha:mfw}
643*14b24e2bSVaishali Kulkarni
644*14b24e2bSVaishali KulkarniThe management firmware runs on its own processor on the chip [\myindex{MCP}] and has many responsibilities – it serves as the entity initially configuring the chip [during bios phase], answering the various management protocols, synchronizing between PFs, configuring the physical link, etc.
645*14b24e2bSVaishali KulkarniHW functions and the \myindex{MFW} may interact with each other in both ways – driver may send messages to the MFW in the form of commands on a buffer, while the MFW generates attentions for the driver and posts messages in a designated mailbox in the SHMEM. The implementation of the interface resides in \texttt{ecore\_mcp.[ch]}, with the addition of \texttt{.h} files generated by the MFW owners, e.g., \texttt{mcp\_public.h} which contains the SHMEM structure and the list of commands.
646*14b24e2bSVaishali KulkarniThe API that should be included by upper-layer driver is defined in \texttt{ecore\_mcp\_api.h}.
647*14b24e2bSVaishali Kulkarni
648*14b24e2bSVaishali KulkarniThe interface between driver and MFW is initialized as early as possible in the initial initialization flow [specifically as part of \texttt{ecore\_hw\_prepare()}],  as this initializes the Driver access to SHMEM which is used later during initialization to learn about the chip configuration [which was read from NVRAM by MFW and written into SHMEM].
649*14b24e2bSVaishali KulkarniThe upper layer doesn’t need to take care of allocating/releasing of this interface – it’s part of the greater initialization/de-initialization of the ecore.
650*14b24e2bSVaishali Kulkarni
651*14b24e2bSVaishali Kulkarni\section{Shared Memory [SHMEM]}
652*14b24e2bSVaishali KulkarniThe \myindex{shared memory} is a segment of memory accessible to all functions as well as the MFW. The memory is used for various purposes:
653*14b24e2bSVaishali Kulkarni\begin{enumerate}
654*14b24e2bSVaishali Kulkarni	\item MFW fills it with current HW configuration, either based on the default found in the NVRAM or based on some management-protocol [e.g., it’s possible vlans configuration is determined by switch and communicated to the MFW]. Driver reads those values and decides upon its logical state/configures HW appropriately. \\
655*14b24e2bSVaishali Kulkarni
656*14b24e2bSVaishali Kulkarni	\item The driver--MFW interface is based on mailboxes in well-known addresses in the SHMEM. \\
657*14b24e2bSVaishali Kulkarni
658*14b24e2bSVaishali Kulkarni	\item It’s possible [as in E3] that there will be driver-held information that will be requested by some management-protocol, and the driver will have to fill it in some well-known address in the SHMEM.
659*14b24e2bSVaishali Kulkarni\end{enumerate}
660*14b24e2bSVaishali Kulkarni
661*14b24e2bSVaishali KulkarniAn upper-layer driver is not supposed to access the SHMEM directly; It should only do so by using ecore functions and accessing ecore structs. The ecore \textit{mcp\_info} struct contains as one of its fields \textit{func\_info} which is filled by the ecore during early device initialization with all the function-specific static\footnote{i.e., data that shouldn't change while driver is running} data. Upper-layer driver can read those values for its own usage.
662*14b24e2bSVaishali Kulkarni
663*14b24e2bSVaishali Kulkarni\section{Ecore - MFW interface}
664*14b24e2bSVaishali Kulkarni\begin{itemize}
665*14b24e2bSVaishali Kulkarni	\item Sending messages from driver to MFW -- Each HW-function has an address in the SHMEM in which the MFW will poll for messages from that HW-function.
666*14b24e2bSVaishali Kulkarni	A message is a u32 consisting of a command bit-mask, which indicates the message the HW-function sends, and a cyclic sequential number.
667*14b24e2bSVaishali Kulkarni	In addition there’s another u32 field which might contain additional parameters [command-specific].
668*14b24e2bSVaishali Kulkarni	The driver increases the sequence number and writes the message and then polls until the MFW writes its response [with the correct sequence number] to another known address in SHMEM\footnote{Obviously, this is a one-pending mechanism.}.
669*14b24e2bSVaishali Kulkarni	The MFW can also send an additional parameter [command-specific]. \\
670*14b24e2bSVaishali Kulkarni
671*14b24e2bSVaishali Kulkarni	\item Messages from MFW to driver -- MFW will trigger a general HW attention which will be handled by the specific HW-function [there’s a different general HW attention per HW-function].
672*14b24e2bSVaishali Kulkarni	Per-HW-function there’s an array of message producers in SHMEM,  of which the ecore maintains a copy.
673*14b24e2bSVaishali Kulkarni	Before sending the attention, the MFW will increment the producer of the message it wishes to inform the driver of, and the driver will recognize the message by noticing the difference in producers.
674*14b24e2bSVaishali Kulkarni	After handling said message, the driver will ack the message by writing the new producer back to SHMEM and disabling the general HW attention.
675*14b24e2bSVaishali Kulkarni	Notice it's [at least theoretically] possible for the ecore to encounter multiple MFW messages following a single attention from HW. \\
676*14b24e2bSVaishali Kulkarni\end{itemize}
677*14b24e2bSVaishali Kulkarni
678*14b24e2bSVaishali KulkarniNotice the commands’ content vary -- some of the commands will require additional parameters to be filled in specific fields in the SHMEM before the commands are passed.
679*14b24e2bSVaishali Kulkarni
680*14b24e2bSVaishali Kulkarni\section{API between ecore's MCP interface and upper-layer driver}
681*14b24e2bSVaishali Kulkarni\myfunc{mcp\_cmd}{mcp_cmd} --  this is the very core of message-passing from driver to MFW. Upper-layer driver should pass the command (FW\_MSG\_CODE\_* from \texttt{mcp\_public.h}) and a parameter, as well as pointers for the MFW response and additional possible parameter. The function will pass the command to the MFW and wait [sleep] for its reply. \\
682*14b24e2bSVaishali Kulkarni
683*14b24e2bSVaishali KulkarniA ‘special’ instance of this function is \texttt{ecore\_mcp\_load\_req()} [which isn’t an API function] - that function sends an indication to the MCP that the HW-function is being loaded.
684*14b24e2bSVaishali KulkarniThe MFW is used as both a book-keeper and synchronization mechanism for the loading of PFs, as there are communal resources. The response will be (FW\_MSG\_CODE\_DRV\_LOAD\_<X>), where X can be either ENGINE, PORT or FUNCTION:
685*14b24e2bSVaishali Kulkarni\begin{itemize}
686*14b24e2bSVaishali Kulkarni	\item Engine – HW-function is the first being loaded on its engine.
687*14b24e2bSVaishali Kulkarni	\item Port – Another HW-function has already initialized the engine, but this HW-function is first on its port.
688*14b24e2bSVaishali Kulkarni	\item Function – Another HW-function has already initialized the port.
689*14b24e2bSVaishali Kulkarni\end{itemize}
690*14b24e2bSVaishali KulkarniAccording to the MFW response the ecore knows what needs to be initialized. \\
691*14b24e2bSVaishali Kulkarni
692*14b24e2bSVaishali Kulkarni\texttt{ecore\_handle\_mcp\_events()} – This function is called from the slowpath interrupt context [sleepless] upon MFW attention to the driver.
693*14b24e2bSVaishali KulkarniDepending on the exact message received from the MFW, it’s possible that this will eventually call some OSAL which will need to be implemented by the upper-layer driver, e.g., in case of link change indication [The upper-layer needs to be notified and should decide on its own what to do with that information].
694*14b24e2bSVaishali Kulkarni
695*14b24e2bSVaishali Kulkarni\section{Link Interface}
696*14b24e2bSVaishali Kulkarni\label{sec:mfw-link}
697*14b24e2bSVaishali KulkarniThe MFW is responsible for configuring the physical link [i.e., MAC, PHY, etc.]. The ecore encapsulates the entire interface with MFW for configuring the link, leaving a relatively narrow API with the upper-layer driver.
698*14b24e2bSVaishali KulkarniThe ecore HW-function contains 2 related structures –
699*14b24e2bSVaishali Kulkarni\silentfunc{mcp_get_link_params}
700*14b24e2bSVaishali Kulkarni\silentfunc{mcp_get_link_state}
701*14b24e2bSVaishali Kulkarni\begin{itemize}
702*14b24e2bSVaishali Kulkarni	\item Link\_params – The ecore uses this as inputs for configuring the link; According to the values in this struct, the ecore will later configure shmem in the appropriate places so that once the MFW receives the command to set the link it will use this configuration.
703*14b24e2bSVaishali Kulkarni	During ecore initialization, the ecore will fill this structure with the default values from SHMEM [values set by MFW according to NVRAM configuration].
704*14b24e2bSVaishali KulkarniWhen upper-layer driver wishes to update link configuration, it should change this struct.
705*14b24e2bSVaishali KulkarniIt can access it by calling \myfunc{mcp\_get\_link\_params}{mcp_get_link_params} \\
706*14b24e2bSVaishali Kulkarni
707*14b24e2bSVaishali Kulkarni	\item Link\_output – The ecore fills the structure from attention handling context whenever the MFW indicates that a link change has occurred. Upper layer driver can read this to get information about the current state of the physical link. It can access this struct by calling \myfunc{mcp\_get\_link\_state}{mcp_get_link_state}.\\
708*14b24e2bSVaishali Kulkarni\end{itemize}
709*14b24e2bSVaishali Kulkarni
710*14b24e2bSVaishali KulkarniIn order to work with the ecore link interface, upper driver needs to implement an OSAL [\texttt{osal\_link\_update()}] which will be called whenever the link state has changed – this will notify the upper driver that the link has changed and that it should probably read link\_output and act upon it. \\
711*14b24e2bSVaishali Kulkarni
712*14b24e2bSVaishali KulkarniIn order to set/reset the link, the upper driver should call \myfunc{mcp\_set\_link}{mcp_set_link} after overriding the link\_params fields with its required link configuration [optional, as without doing anything the structure will contain the default link configuration found in SHMEM].
713*14b24e2bSVaishali KulkarniPassing true will cause MFW to try setting the link [either by force or via auto-negotiation, based on the configuration], while passing false will cause the MFW to reset the link.
714*14b24e2bSVaishali Kulkarni
715*14b24e2bSVaishali KulkarniNotice the logic for link-flap-avoidance should be contained in MFW, e.g., in multi-function mode there’s no need for the upper-layer driver to count the number of functions loaded in order to decide whether during unload it should request a link reset; It should do it regardless.
716*14b24e2bSVaishali KulkarniIt’s the MFW's duty to decide whether the unloading function is actually the last loaded function on its port and thus whether to actually reset the link.
717*14b24e2bSVaishali Kulkarni
718*14b24e2bSVaishali Kulkarni\subsection{Energy Efficient Ethernet (EEE)}
719*14b24e2bSVaishali KulkarniEEE feature enables the device to put its transistors in sleep mode when there is no data activity on the wire. Hence, it achieves a significant reduction in the power consumption of the device. It's a Base-T feature, more details of which are captured under IEEE 802.3az standard. MFW negotiates the EEE parameters with the peer device and the results will be shared to the ecore as part of link notification. Following are the negotiated parameters which will be encapsulated in the struct \texttt{ecore\_mcp\_link\_state}.
720*14b24e2bSVaishali Kulkarni\begin{itemize}
721*14b24e2bSVaishali Kulkarni	\item eee\_active – EEE is negotiated and is currently operational.
722*14b24e2bSVaishali Kulkarni	\item eee\_adv\_caps – Device advertized capabilities.
723*14b24e2bSVaishali Kulkarni	\item eee\_lpi\_adv\_caps – Peer device advertized capabilities.
724*14b24e2bSVaishali Kulkarni\end{itemize}
725*14b24e2bSVaishali KulkarniFollowing are the EEE link parameters which can be queried by upper layer driver using \myfunc{mcp\_get\_link\_params}{mcp_get_link_params} API.
726*14b24e2bSVaishali Kulkarni\begin{itemize}
727*14b24e2bSVaishali Kulkarni	\item eee\_enable – EEE is enabled.
728*14b24e2bSVaishali Kulkarni	\item eee\_supported – Device supports EEE.
729*14b24e2bSVaishali Kulkarni	\item eee\_tx\_lpi\_enable – Determines whether the device should assert its Tx LPI.
730*14b24e2bSVaishali Kulkarni	\item eee\_tx\_lpi\_timer – EEE delay timer value, i.e., amount of time device should stay in idle mode prior to asserting its Tx LPI  (in  microseconds).
731*14b24e2bSVaishali Kulkarni\end{itemize}
732*14b24e2bSVaishali KulkarniUpper layer driver can configure one or more of the following EEE parameters.
733*14b24e2bSVaishali Kulkarni\begin{itemize}
734*14b24e2bSVaishali Kulkarni	\item eee\_enable
735*14b24e2bSVaishali Kulkarni	\item eee\_adv\_caps
736*14b24e2bSVaishali Kulkarni	\item eee\_tx\_lpi\_enable
737*14b24e2bSVaishali Kulkarni	\item eee\_tx\_lpi\_timer
738*14b24e2bSVaishali Kulkarni\end{itemize}
739*14b24e2bSVaishali Kulkarni
740*14b24e2bSVaishali Kulkarni\section{Dcbx Interface}
741*14b24e2bSVaishali Kulkarni\label{sec:mfw-dcbx}
742*14b24e2bSVaishali KulkarniThe MFW is responsible for negotiating the dcbx parameters [e.g., per priority flow control (PFC)] with peer device. During initialization, MFW reads the dcbx parameters from NVRAM (called local parameters) and negotiates these with the peer. The negotiated/agreed parameters are called operational dcbx parameters. MFW provides driver interfaces for querying and configuring the dcbx parameters. The ecore dcbx implementation provides three APIs, one for querying the dcbx parameters and the other two for updating the dcbx configuration.
743*14b24e2bSVaishali Kulkarni\silentfunc{dcbx_query_params}
744*14b24e2bSVaishali Kulkarni\silentfunc{dcbx_get_config_params}
745*14b24e2bSVaishali Kulkarni\silentfunc{dcbx_config_params}
746*14b24e2bSVaishali Kulkarni\begin{itemize}
747*14b24e2bSVaishali Kulkarni	\item \myfunc{dcbx\_query\_params}{dcbx\_query\_params} – The API returns the current dcbx configuration. It expects type (i.e., local/remote/operational) and the buffer for storing the dcbx parameters of that type.\\
748*14b24e2bSVaishali Kulkarni
749*14b24e2bSVaishali Kulkarni	\item \myfunc{dcbx\_get\_config\_params}{dcbx\_get\_config\_params} - The API returns the currently cached dcbx parameter set that can be modified for making the dcbx update requests. \\
750*14b24e2bSVaishali Kulkarni
751*14b24e2bSVaishali Kulkarni	\item \myfunc{dcbx\_config\_params}{dcbx\_config\_params} – The API is used for sending the dcbx parameters update request. The API expects dcbx parameters to be configured and the flag specifying whether the parameters need to be sent to hardware or just cached at the ecore. When driver sends dcbx config to the hardware, device initiates the dcbx negotiation with the peer using lldp protocol. The negotiation takes a few seconds to complete, and also the lldp requests are rate limited (using a predefined credit value). The dcbx API option “hw\_commit” specifies whether the dcbx parameters need to be committed to the hardware or just cached at the driver. When client requests the commit, all the cached parameters are sent to the device and the parameter negotiation will be initiated with the peer. \\
752*14b24e2bSVaishali Kulkarni\end{itemize}
753*14b24e2bSVaishali KulkarniTo configure the dcbx parameters, the upper layer driver invokes the ecore\_dcbx\_get\_config\_params() API to get the current config parameter set, updates the required parameters, and then invokes the ecore\_dcbx\_config\_params() API.
754*14b24e2bSVaishali Kulkarni
755*14b24e2bSVaishali KulkarniIf there is any change in the dcbx configuration at the host (for example due to a negotiation with the peer), then MFW notifies the same to ecore. OSAL\_DCBX\_AEN() would be called after such notification, ecore client would need to provide the implementation for this OSAL.
756*14b24e2bSVaishali Kulkarni
757*14b24e2bSVaishali Kulkarni\section{Management protocol APIs}
758*14b24e2bSVaishali Kulkarni\label{sec:mfw-protocols}
759*14b24e2bSVaishali KulkarniMFW needs various bits of information from the driver, and it gathers those in one of two methods:
760*14b24e2bSVaishali Kulkarni\begin{itemize}
761*14b24e2bSVaishali Kulkarni	\item Pulling – if ecore can’t provide information on its own, ecore-client would be required to implement an OSAL.\\
762*14b24e2bSVaishali Kulkarni	\item Pushing – it’s the ecore and ecore-client’s responsibility to push the data.\\
763*14b24e2bSVaishali Kulkarni\end{itemize}
764*14b24e2bSVaishali KulkarniIn some cases, ‘Push’ is done without involvement of the ecore-client. If that’s not possible, it becomes more risky as the responsibility of doing things correctly passes to the ecore-client. Ecore-client shouldn’t presume to do ‘push’ only for calls which match the configured management mode. Instead it should always do them and let the ecore be the arbiter of whether those are needed by MFW or not. Ecore provides the following APIs for updating the configuration attributes, it is the client's responsibility to invoke these APIs at the appropriate time.
765*14b24e2bSVaishali Kulkarni\silentfunc{mcp_ov_update_current_config}
766*14b24e2bSVaishali Kulkarni\silentfunc{mcp_ov_update_mtu}
767*14b24e2bSVaishali Kulkarni\silentfunc{mcp_ov_update_mac}
768*14b24e2bSVaishali Kulkarni\silentfunc{mcp_ov_update_wol}
769*14b24e2bSVaishali Kulkarni\silentfunc{mcp_ov_update_driver_state}
770*14b24e2bSVaishali Kulkarni\silentfunc{mcp_update_fcoe_cvid}
771*14b24e2bSVaishali Kulkarni\silentfunc{mcp_update_fcoe_fabric_name}
772*14b24e2bSVaishali Kulkarni\begin{itemize}
773*14b24e2bSVaishali Kulkarni	\item \myfunc{mcp\_ov\_update\_current\_config}{mcp\_ov\_update\_current\_config} – Drivers need to call this API when user updates one (or more) of the following: mtu, primary mac or Wake on LAN settings (to a non-default value). In addition, it also needs to call a unique API per each:
774*14b24e2bSVaishali Kulkarni	\begin{itemize}
775*14b24e2bSVaishali Kulkarni		\item \myfunc{mcp\_ov\_update\_mtu}{mcp\_ov\_update\_mtu} – called when user sets the mtu to a value other than the default provided by the ecore.\\
776*14b24e2bSVaishali Kulkarni
777*14b24e2bSVaishali Kulkarni		\item \myfunc{mcp\_ov\_update\_mac}{mcp\_ov\_update\_mac} – called when user updates the primary mac address.\\
778*14b24e2bSVaishali Kulkarni
779*14b24e2bSVaishali Kulkarni		\item \myfunc{mcp\_ov\_update\_wol}{mcp\_ov\_update\_wol} – called when Wake-on-LAN settings are updated.\\
780*14b24e2bSVaishali Kulkarni	\end{itemize}
781*14b24e2bSVaishali Kulkarni	\item \myfunc{mcp\_ov\_update\_driver\_state}{mcp\_ov\_update\_driver\_state} – notify about a change in the driver state. Following are the possible driver states,
782*14b24e2bSVaishali Kulkarni	\begin{itemize}
783*14b24e2bSVaishali Kulkarni		\item ECORE\_OV\_DRIVER\_STATE\_NOT\_LOADED - Firmware is not loaded.\\
784*14b24e2bSVaishali Kulkarni
785*14b24e2bSVaishali Kulkarni		\item ECORE\_OV\_DRIVER\_STATE\_DISABLED - Driver is not ready yet.\\
786*14b24e2bSVaishali Kulkarni
787*14b24e2bSVaishali Kulkarni		\item ECORE\_OV\_DRIVER\_STATE\_ACTIVE - Driver is operational.\\
788*14b24e2bSVaishali Kulkarni	\end{itemize}
789*14b24e2bSVaishali Kulkarni	Ecore sets the following driver states,
790*14b24e2bSVaishali Kulkarni	\begin{itemize}
791*14b24e2bSVaishali Kulkarni		\item DISABLED - After firmware is successfully loaded on the device, ecore updates the driver state as DISABLED (as part of ecore\_hw\_init() implementation). \\
792*14b24e2bSVaishali Kulkarni		\item NOT\_LOADED - Ecore sets this state when the protocol driver is unloaded (as part of ecore\_hw\_remove()).\\
793*14b24e2bSVaishali Kulkarni	\end{itemize}
794*14b24e2bSVaishali Kulkarni	It's the protocol driver's responsibility to alternate between the states,
795*14b24e2bSVaishali Kulkarni	\begin{itemize}
796*14b24e2bSVaishali Kulkarni		\item ACTIVE - Set when the required initialization is done from the driver side and the device is ready for traffic switching.\\
797*14b24e2bSVaishali Kulkarni		\item DISABLED - Set when device is not operational (e.g., fastpath queues are released or not configured).\\
798*14b24e2bSVaishali Kulkarni	\end{itemize}
799*14b24e2bSVaishali Kulkarni	\item \myfunc{mcp\_update\_fcoe\_cvid}{mcp_update_fcoe_cvid} - Update MFW with the 802.1q fcoe vlan id assigned for the PF.\\
800*14b24e2bSVaishali Kulkarni	\item \myfunc{mcp\_update\_fcoe\_fabric\_name}{mcp_update_fcoe_fabric_name} - Update fabric name value to the MFW. Fabric name is the value returned by the fabric domain controller in response to a GS-FC “Get Fabric Name” command from the adapter.\\
801*14b24e2bSVaishali Kulkarni\end{itemize}
802*14b24e2bSVaishali Kulkarni
803*14b24e2bSVaishali KulkarniEcore also provides the TLV request interface for MFW for querying the driver/device attributes. MFW uses mailbox interface to notify ecore on the required TLV information. Ecore parses the request, populates the required information with the help of ecore clients and sends it to the MFW. Ecore client needs to provide necessary infrastructure and the OSALs for implementing this interface.
804*14b24e2bSVaishali Kulkarni\begin{itemize}
805*14b24e2bSVaishali Kulkarni	\item OSAL\_MFW\_TLV\_REQ - The call indicates that ecore has received a TLV request notification from the MFW. The execution context is interrupt mode, hence the ecore client needs to schedule a thread/bottom-half context to handle this task, and return the control immediately. The bottom-half thread will need to invoke \myfunc{mfw\_process\_tlv\_req}{mfw_process_tlv_req} for further processing of the TLV request.\\
806*14b24e2bSVaishali Kulkarni	\item OSAL\_MFW\_FILL\_TLV\_DATA - Ecore invokes this callback to get the TLV values of a given type. Ecore client needs to fill in the values for all the fields that it's aware of, and also needs to set the flags associated with the respective fields. For instance,  if client sets value for 'npiv\_enabled' field, it needs to set the flag 'npiv\_enabled\_set' to true.\\
807*14b24e2bSVaishali Kulkarni\end{itemize}
808*14b24e2bSVaishali Kulkarni
809*14b24e2bSVaishali Kulkarni\SpillChapterFuncs
810*14b24e2bSVaishali Kulkarni
811*14b24e2bSVaishali Kulkarni
812*14b24e2bSVaishali Kulkarni\chapterimage{qlogic-full-36}
813*14b24e2bSVaishali Kulkarni\chapter{L2 protocol}
814*14b24e2bSVaishali Kulkarni\label{cha:l2}
815*14b24e2bSVaishali Kulkarni
816*14b24e2bSVaishali Kulkarni\section{L2-related terminology}
817*14b24e2bSVaishali KulkarniThis section describes in a very high-level manner several FW objects which are related to L2. Developers implementing L2 support over the ecore should be familiar with these\footnote{Probably even more than is in the scope of this document.}.
818*14b24e2bSVaishali Kulkarni\begin{itemize}
819*14b24e2bSVaishali Kulkarni	\item Virtual port [\myindex{VPORT}] -- Can simply be seen as a collection of queues, each HW-function will have at least one VPORT configured\footnote{And in most scenarios one will suffice.}. Classifications are configured per-VPORT. \\
820*14b24e2bSVaishali Kulkarni
821*14b24e2bSVaishali Kulkarni	\item Queues -- Either Rx/Tx, queues are attached to a VPORT. There can be multiple queues per-VPORT [e.g., if RSS/TSS is supported]. Usually, each Rx queue will use its own status block for interrupts upon Rx packets but Tx queues can utilize the same status blocks, using different protocol indices. \\
822*14b24e2bSVaishali Kulkarni\end{itemize}
823*14b24e2bSVaishali Kulkarni
824*14b24e2bSVaishali Kulkarni\section{Starting an L2 device}
825*14b24e2bSVaishali Kulkarni\label{sec:l2-start}
826*14b24e2bSVaishali KulkarniThis section begins after section \ref{sec:init-init}, i.e., assuming the HW-function has already been initialized by the init tool and the PF\_START ramrod has already been sent.
827*14b24e2bSVaishali Kulkarni
828*14b24e2bSVaishali Kulkarni	\begin{NOTICE}
829*14b24e2bSVaishali Kulkarni	Although VPORTs' and queues' indices are shared between all HW-functions on the same engine, the resource allocation scheme determines a range of VPORTs per-HW-function to use for configuration [i.e., developer can assume starting index is always 0 per-HW-function].
830*14b24e2bSVaishali Kulkarni	\end{NOTICE}
831*14b24e2bSVaishali Kulkarni
832*14b24e2bSVaishali Kulkarni
833*14b24e2bSVaishali Kulkarni\silentfunc{sp_vport_start}
834*14b24e2bSVaishali Kulkarni\silentfunc{eth_rx_queue_start}
835*14b24e2bSVaishali Kulkarni\silentfunc{eth_tx_queue_start}
836*14b24e2bSVaishali Kulkarni\silentfunc{sp_vport_update}
837*14b24e2bSVaishali Kulkarni\begin{enumerate}
838*14b24e2bSVaishali Kulkarni	\item \myfunc{sp\_vport\_start}{sp_vport_start} -- this function initializes a vport in FW [ETH\_RAMROD\_VPORT\_START will be sent]. The handle for this function is a \texttt{vport\_id} which is passed and the most 'interesting' argument is the MTU for that VPORT.
839*14b24e2bSVaishali Kulkarni	This VPORT will be inactive after sending this ramrod, i.e., until enabling it via a vport update it will not actually perform Rx/Tx. \\
840*14b24e2bSVaishali Kulkarni
841*14b24e2bSVaishali Kulkarni	\item \myfunc{eth\_rx\_queue\_start}{eth_rx_queue_start} -- initializes an rx queue on a given VPORT.
842*14b24e2bSVaishali Kulkarni	A prerequisite is that the VPORT has already been initialized.
843*14b24e2bSVaishali Kulkarni	There are 2 identifiers of the queue - the queue index to add and the VPORT index to add it to. The queue-index should be unique for the Rx-queue; No 2 Rx-queues of the same PF should use the same id.
844*14b24e2bSVaishali Kulkarni	There are quite a few parameters that need to be supplied, e.g., status block, physical addresses of rings, etc.
845*14b24e2bSVaishali Kulkarni
846*14b24e2bSVaishali Kulkarni	The function is expected to receive a pointer to a \texttt{p\_ret\_params} which it will fill with outputs [upon success]. The ecore would fill the address where producer-updates need to be written [in the storm's RAM]; The upper-driver will write producer updates to that address to replenish its Rx-rings.
847*14b24e2bSVaishali Kulkarni
848*14b24e2bSVaishali Kulkarni	\begin{NOTICE}
849*14b24e2bSVaishali Kulkarni	Address is mapped by GTT, so upper-driver can simply write to that address, using the necessary memory barriers.
850*14b24e2bSVaishali Kulkarni	\end{NOTICE}
851*14b24e2bSVaishali Kulkarni	In addition, ecore would also fill a \texttt{p\_handle}. This handle is opaque to the ecore-client, and should be passed to other Rx-queue APIs when doing configuration relating to that queue.
852*14b24e2bSVaishali Kulkarni
853*14b24e2bSVaishali Kulkarni	After calling this function, upper-layer driver should initialize the Rx packets producers. \\
854*14b24e2bSVaishali Kulkarni
855*14b24e2bSVaishali Kulkarni	\item \myfunc{eth\_tx\_queue\_start}{eth_tx_queue_start} -- initializes a Tx queue on a given VPORT.
856*14b24e2bSVaishali Kulkarni		Very similar to the Rx queue start method, with some slight differences in the parameters [BD ring address instead of Rx rings, etc.]. For Tx-queues, the same queue-id can be shared between 2 different queues. That would cause those queues to share the same coalescing configuration.
857*14b24e2bSVaishali Kulkarni		Just like for Rx-queues, the ecore would fill the \texttt{p\_ret\_params} with an opaque handle to be used for further calls relating to this queue. In addition, it will provide a \texttt{p\_doorbell} address, which is an address into which a doorbell needs to be written to activate firmware once a packet is placed on this Tx queue and the buffer descriptors are filled.
858*14b24e2bSVaishali Kulkarni		\begin{NOTICE}
859*14b24e2bSVaishali Kulkarni		Doorbell addresses are on a different BAR than that of other memories/registers accessed by driver, and the PTT/GTT scheme does not apply to it; Thus the address can simply be accessed using the necessary memory barriers.
860*14b24e2bSVaishali Kulkarni		\end{NOTICE}
861*14b24e2bSVaishali Kulkarni
862*14b24e2bSVaishali Kulkarni	\item \myfunc{sp\_vport\_update}{sp_vport_update} -- This is required to enable the VPORT. It should be called after the Tx/Rx queues were already added, and this will enable the VPORT to send and receive packets\footnote{Notice that without classification configuration Rx won't actually work. Also notice this function can do a lot of things; Enabling the VPORT is only one of them.}.
863*14b24e2bSVaishali Kulkarni
864*14b24e2bSVaishali Kulkarni	In order to enable the VPORT for traffic, the upper-layer driver should set in \texttt{p\_params} the following:
865*14b24e2bSVaishali Kulkarni	\begin{enumerate}
866*14b24e2bSVaishali Kulkarni		\item \texttt{update\_vport\_active\_flg} to 1.
867*14b24e2bSVaishali Kulkarni		\item \texttt{vport\_active\_flg} to 1.
868*14b24e2bSVaishali Kulkarni	\end{enumerate}
869*14b24e2bSVaishali Kulkarni
870*14b24e2bSVaishali Kulkarni\end{enumerate}
871*14b24e2bSVaishali Kulkarni
872*14b24e2bSVaishali Kulkarni\section{Configuring Classifications}
873*14b24e2bSVaishali Kulkarni\label{sec:l2-class}
874*14b24e2bSVaishali KulkarniClassification configuration consists [mostly] of three things:
875*14b24e2bSVaishali Kulkarni\begin{enumerate}
876*14b24e2bSVaishali Kulkarni	\item Configuration of the \myindex{Rx mode} -- This defines which datagrams [unicast, multicast, broadcast] should be accepted by the VPORT, and whether all such datagrams are accepted or only those for which a filter is configured.
877*14b24e2bSVaishali Kulkarni	\item Configuration of unicast / multicast filters -- defining filters for specific unicast / multicast addresses which should be matched, given that Rx mode  agrees.
878*14b24e2bSVaishali Kulkarni	\item Configuration of vlan filters -- by default, all vlans will be accepted. If at least one vlan [or vlan-mac] filter will be configured only traffic which matches one of the configured vlan filters will pass through.
879*14b24e2bSVaishali Kulkarni\end{enumerate}
880*14b24e2bSVaishali Kulkarni
881*14b24e2bSVaishali KulkarniThere are several ecore functions which are responsible for configuring classifications:
882*14b24e2bSVaishali Kulkarni
883*14b24e2bSVaishali Kulkarni\silentfunc{filter_accept_cmd}
884*14b24e2bSVaishali Kulkarni\silentfunc{sp_eth_filter_ucast}
885*14b24e2bSVaishali Kulkarni\silentfunc{filter_mcast_cmd}
886*14b24e2bSVaishali Kulkarni\begin{itemize}
887*14b24e2bSVaishali Kulkarni	\item \myfunc{filter\_accept\_cmd}{filter_accept_cmd} -- configures the Rx mode of the device.
888*14b24e2bSVaishali Kulkarni	\item \myfunc{sp\_vport\_update}{sp_vport_update} -- although not exactly a classification function, calling this will re-set the Rx mode [this calls \texttt{ecore\_filter\_accept\_cmd()} as part of its work].
889*14b24e2bSVaishali Kulkarni	\item \myfunc{sp\_eth\_filter\_ucast}{sp_eth_filter_ucast} -- configures either a unicast filter, vlan filter or a unicast/vlan filter pair.
890*14b24e2bSVaishali Kulkarni		An important parameter for the upper-layer driver\footnote{in the sense that it might affect design, since all fields are relevant.} is the `opcode' field:
891*14b24e2bSVaishali Kulkarni		\begin{itemize}
892*14b24e2bSVaishali Kulkarni			\item ECORE\_FILTER\_ADD -- adds a new filter.
893*14b24e2bSVaishali Kulkarni			\item ECORE\_FILTER\_REMOVE -- removes a filter.
894*14b24e2bSVaishali Kulkarni			\item ECORE\_FILTER\_MOVE -- removes a filter from one vport and adds it to another simultaneously\footnote{Needed by windows.}.
895*14b24e2bSVaishali Kulkarni			\item ECORE\_FILTER\_REPLACE -- adds a new filter after removing all previously configured filters.
896*14b24e2bSVaishali Kulkarni		\end{itemize}
897*14b24e2bSVaishali Kulkarni
898*14b24e2bSVaishali Kulkarni	\item \myfunc{filter\_mcast\_cmd}{filter_mcast_cmd} -- configures a multicast filter.
899*14b24e2bSVaishali Kulkarni	\begin{warning}
900*14b24e2bSVaishali Kulkarni		This function exists in the ecore but at the moment it's not implemented.
901*14b24e2bSVaishali Kulkarni	\end{warning}
902*14b24e2bSVaishali Kulkarni
903*14b24e2bSVaishali Kulkarni\end{itemize}
904*14b24e2bSVaishali KulkarniThese functions expose the \texttt{ecore\_spq} implementation -- upper-driver layer can choose whether to wait for completion, supply a callback for completion or do-nothing upon completion (the last will usually be the chosen path).
905*14b24e2bSVaishali Kulkarni
906*14b24e2bSVaishali Kulkarni\section{Stopping an L2 device}
907*14b24e2bSVaishali KulkarniThis is pretty straightforward, and works in reverse-order to the initialization of the L2 device.
908*14b24e2bSVaishali KulkarniAfter upper-layer driver guarantees that no new Tx-packets will be generated and once Tx queues are all empty, it should do the following:
909*14b24e2bSVaishali Kulkarni\silentfunc{eth_tx_queue_stop}
910*14b24e2bSVaishali Kulkarni\silentfunc{eth_rx_queue_stop}
911*14b24e2bSVaishali Kulkarni\silentfunc{sp_vport_stop}
912*14b24e2bSVaishali Kulkarni\begin{enumerate}
913*14b24e2bSVaishali Kulkarni	\item Disable the VPORT by calling \texttt{ecore\_sp\_vport\_update()} after setting:
914*14b24e2bSVaishali Kulkarni	\begin{enumerate}
915*14b24e2bSVaishali Kulkarni		\item \texttt{update\_vport\_active\_flg} to 1.
916*14b24e2bSVaishali Kulkarni		\item \texttt{vport\_active\_flg} to 0.
917*14b24e2bSVaishali Kulkarni	\end{enumerate}
918*14b24e2bSVaishali Kulkarni
919*14b24e2bSVaishali Kulkarni	\item Close all Tx queues\footnote{Actually, order does not matter between Tx and Rx queues}  by calling \myfunc{eth\_tx\_queue\_stop}{eth_tx_queue_stop}.
920*14b24e2bSVaishali Kulkarni
921*14b24e2bSVaishali Kulkarni	\item Close all Rx queues by \myfunc{eth\_rx\_queue\_stop}{eth_rx_queue_stop}.
922*14b24e2bSVaishali Kulkarni
923*14b24e2bSVaishali Kulkarni	\item Close the vport by calling \myfunc{sp\_vport\_stop}{sp_vport_stop}.
924*14b24e2bSVaishali Kulkarni\end{enumerate}
925*14b24e2bSVaishali Kulkarni
926*14b24e2bSVaishali KulkarniFollowing the completion of the \texttt{vport\_stop}, no further traffic should be working. Interrupts can be released, and resources can be freed.
927*14b24e2bSVaishali KulkarniNotice this on its own doesn't return the device into a 'clean-slate' state; There are still several non-L2 things that need to be done [e.g., cleaning the status blocks of the queues].
928*14b24e2bSVaishali Kulkarni\SpillChapterFuncs
929*14b24e2bSVaishali Kulkarni
930*14b24e2bSVaishali Kulkarni\chapterimage{pictures/qlogic-full-36.jpg}
931*14b24e2bSVaishali Kulkarni\chapter{100G support}
932*14b24e2bSVaishali Kulkarni\label{cha:100}
933*14b24e2bSVaishali Kulkarni
934*14b24e2bSVaishali KulkarniOur device supports \myindex{100G} link. However, the fastpath pipeline of each HW engine isn't fast enough for that line-rate. The Hardware function term is a catchphrase for the HW resource and identifications normally required by a single pci function. In 100G mode, the device will enumerate as a single pci function\footnote{Or more in multi-function mode; But we will stick with single-function mode for simplicity here.}, but the driver running over this pci function will utilize multiple HW functions.
935*14b24e2bSVaishali KulkarniFrom pci standpoint, the distinction between the HW functions (and thus the HW engines) is done via the bar address. Access to the first half of each of the pci function's bars will be translated into an access into a HW function on the first engine, while access to the second half will be translated into an access into a HW function on the second engine.
936*14b24e2bSVaishali KulkarniFrom the wire standpoint, both HW-functions are connected to a single physical port, i.e. transmitting traffic from either HW-function will lead to transmission on the same physical port. Incoming traffic from the port is routed to a hardware engine according to its protocol 4-tuple. The HW block responsible for this routing is the \myindex{OPTE}.
937*14b24e2bSVaishali Kulkarni
938*14b24e2bSVaishali KulkarniThis special configuration is also sometimes referred to as \myindex{Couple Mode Teaming} or \myindex{CMT}.
939*14b24e2bSVaishali Kulkarni
940*14b24e2bSVaishali KulkarniAfter the early initialization phase of the ecore (i.e., following ecore\_hw\_prepare()), the \textit{ecore\_dev} field \myindex{num\_hwfns} will be filled with the correct number of HW-functions under the PCI device. The ecore and its client should access only the first num\_hwfns entries in the \textit{hwfns} array.
941*14b24e2bSVaishali Kulkarni
942*14b24e2bSVaishali Kulkarni\section{Effects on MSI-X interrupts}
943*14b24e2bSVaishali Kulkarni\label{sec:100int}
944*14b24e2bSVaishali KulkarniEach path has its own IGU CAM, meaning it has its own set of available status blocks. But as both HW-functions share the same PCI function, there is a single MSI-X table for that device.
945*14b24e2bSVaishali KulkarniAs a result, when in CMT the MSI-X vectors are split between the two HW-functions.
946*14b24e2bSVaishali Kulkarni
947*14b24e2bSVaishali Kulkarni\begin{exampleT}
948*14b24e2bSVaishali Kulkarni\label{ex:CMT1}
949*14b24e2bSVaishali KulkarniAssume a PCI function is in CMT mode. Let $\text{hwfn}_0$ stand for its HW-function under the first engine and $\text{hwfn}_1$ stand for its HW-function under the second engine.
950*14b24e2bSVaishali KulkarniLet $\text{MSIX}_i$ stand for the $i^{th}$ entry in the PCI function's MSI-X table.
951*14b24e2bSVaishali Kulkarni
952*14b24e2bSVaishali KulkarniThen $\forall n \in \mathbb{N}_{0}$, $\text{MSIX}_{2n}$ is connected to $\text{hwfn}_0$'s status block of index $n$, and $\text{MSIX}_{2n+1}$ is connected to $\text{hwfn}_1$'s status block of index $n$.
953*14b24e2bSVaishali Kulkarni\end{exampleT}
954*14b24e2bSVaishali Kulkarni
955*14b24e2bSVaishali Kulkarni\section{Effects on device slowpath configuration}
956*14b24e2bSVaishali KulkarniEcore handles almost all the differences between CMT and regular mode on its own, i.e., it reads the number of HW-functions under the device and iterates when needed to configure both engines correctly (whereas in non-CMT mode it would have simply configured one).
957*14b24e2bSVaishali KulkarniWhat it does require is:
958*14b24e2bSVaishali Kulkarni\begin{itemize}
959*14b24e2bSVaishali Kulkarni	\item Implement OSAL\_BAR\_SIZE. Ecore uses it to determine where it needs to split the bars; without it, things will very likely fail very early during initialization.
960*14b24e2bSVaishali Kulkarni
961*14b24e2bSVaishali Kulkarni	\item Set the HW-function's pf\_params for each HW-function before calling \textit{ecore\_resc\_alloc}.
962*14b24e2bSVaishali Kulkarni
963*14b24e2bSVaishali Kulkarni	\item Enable slowpath interrupts -- the first 2 MSI-X vectors should be used for slowpath. Notice that the ecore itself will call OSAL\_DPC\_INIT for each HW-function.
964*14b24e2bSVaishali Kulkarni	\begin {exampleT}
965*14b24e2bSVaishali Kulkarni		Following Example [\ref{ex:CMT1}], $\text{MSIX}_0$ should be enabled and connected to the DPC of $\text{hwfn}_0$ and $\text{MSIX}_1$ should be enabled and connected to the DPC of $\text{hwfn}_1$.
966*14b24e2bSVaishali Kulkarni	\end{exampleT}
967*14b24e2bSVaishali Kulkarni\end{itemize}
968*14b24e2bSVaishali Kulkarni
969*14b24e2bSVaishali KulkarniWhen disabling the slowpath, it's important to remember that there were 2 different DPCs allocated and 2 MSI-X vectors configured to support them, as it is the ecore client's responsibility to disable the interrupts.
970*14b24e2bSVaishali Kulkarni
971*14b24e2bSVaishali Kulkarni\section{Effects on L2 fastpath configuration}
972*14b24e2bSVaishali KulkarniSince each HW-function is running on a different path and is an independent entity (as perceived by FW/HW), configuration should be almost symmetric for both HW-functions. E.g., following the flow of section \ref{sec:l2-start}, ecore\_sp\_vport\_start() should be called separately for each HW-function, queues should be opened separately for each, etc.
973*14b24e2bSVaishali Kulkarni
974*14b24e2bSVaishali KulkarniNotice that in most cases you can even use the same indices, since FW-indices are per-path. E.g., you can use $\text{vport}_0$ on both HW-functions, since they are different on each path.
975*14b24e2bSVaishali Kulkarni
976*14b24e2bSVaishali Kulkarni\begin{warning}
977*14b24e2bSVaishali Kulkarni	When allocating the status blocks for your queues, do recall that the MSI-X table is shared between the engines, as explained in section [\ref{sec:100int}].
978*14b24e2bSVaishali Kulkarni\end{warning}
979*14b24e2bSVaishali Kulkarni
980*14b24e2bSVaishali Kulkarni\begin{NOTICE}
981*14b24e2bSVaishali Kulkarni	There is an issue between the user control of the number of queues and the actual configuration of queues - e.g., assume user wants $X$ queues. If we use a symmetric configuration what we actually do is open $X$ queues on each path, meaning we actually open $2X$ queues.
982*14b24e2bSVaishali Kulkarni
983*14b24e2bSVaishali Kulkarni	We can either only open $X/2$ queues on each engine, in which case we lose some abilities, e.g., control the keys of the RSS hash-function, or open $2X$ queues and try to hide this fact from user, but this most likely will either incur a performance penalty, hard-to-maintain code or both.
984*14b24e2bSVaishali Kulkarni\end{NOTICE}
985*14b24e2bSVaishali Kulkarni
986*14b24e2bSVaishali Kulkarni\chapterimage{qlogic-full-36}
987*14b24e2bSVaishali Kulkarni\chapter{iSCSI protocol}
988*14b24e2bSVaishali Kulkarni\label{cha:iscsi}
989*14b24e2bSVaishali Kulkarni
990*14b24e2bSVaishali KulkarniThis chapter describes the ecore interface for the upper-layer driver of the iSCSI protocol.
991*14b24e2bSVaishali Kulkarni
992*14b24e2bSVaishali Kulkarni\section{Start iSCSI PF}
993*14b24e2bSVaishali Kulkarni\silentfunc{sp_iscsi_func_start}
994*14b24e2bSVaishali Kulkarni\silentfunc{iscsi_get_global_cmdq_cons}
995*14b24e2bSVaishali Kulkarni\begin{itemize}
996*14b24e2bSVaishali Kulkarni	\item The basic initialization process is described in section \ref{sec:init-init} for all protocols. \\
997*14b24e2bSVaishali Kulkarni	Specifically for iSCSI, before calling \texttt{ecore\_resc\_alloc()}, the upper driver should determine the PF-global parameters, allocate all PF-global queues, and fill the \texttt{iscsi\_pf\_params} part in struct \texttt{ecore\_pf\_params}. \\
998*14b24e2bSVaishali Kulkarni	The following table describes the parameters that should be filled (the rest should be zero):
999*14b24e2bSVaishali Kulkarni	\begin{center}
1000*14b24e2bSVaishali Kulkarni		\begin{tabular}{| l | p{10cm} |}
1001*14b24e2bSVaishali Kulkarni		\hline
1002*14b24e2bSVaishali Kulkarni		\textbf{Parameter} & \textbf{Description} \\ \hline
1003*14b24e2bSVaishali Kulkarni		\texttt{num\_cons} & Up to 4K are supported, suggested default value 128 \\ \hline
1004*14b24e2bSVaishali Kulkarni		\texttt{num\_tasks} & Up to 4K are supported, suggested default value 1K \\ \hline
1005*14b24e2bSVaishali Kulkarni		\texttt{half\_way\_close\_timeout} & Timeout from sending FIN until abortive close, suggested default value 10sec \\ \hline
1006*14b24e2bSVaishali Kulkarni		\texttt{num\_sq\_pages\_in\_ring} & Number of outstanding tasks on the connection * 8B / page-size. \newline Suggested default value for number of outstanding tasks on the connection 256 \\ \hline
1007*14b24e2bSVaishali Kulkarni		\texttt{num\_r2tq\_pages\_in\_ring} & Same as \texttt{num\_sq\_pages\_in\_ring} \\ \hline
1008*14b24e2bSVaishali Kulkarni		\texttt{num\_uhq\_pages\_in\_ring} & Number of outstanding un-ACKed PDUs, suggested default value -- same as \texttt{num\_sq\_pages\_in\_ring} \\ \hline
1009*14b24e2bSVaishali Kulkarni		\texttt{num\_queues} & Number of global queues (CQ / CmdQ / RQ). \newline This should be $\leq$ number of available MSIX vectors for the PF \\ \hline
1010*14b24e2bSVaishali Kulkarni		\texttt{log\_page\_size} & 12 for 4KB pages \\ \hline
1011*14b24e2bSVaishali Kulkarni		\texttt{glbl\_q\_params\_addr} & The physical address of the list of pointers to the arrays of pointers to global queues pages. \newline The list is built as follows: CQ\#0 PBL pointer, RQ\#0 PBL pointer, CmdQ\#0 PBL pointer, CQ\#1 PBL pointer, RQ\#1 PBL pointer, CmdQ\#1 PBL pointer, etc. \newline Each PBL pointer points to the physical address which contains an array of pointers to the physical addresses of the specific queue pages. \\ \hline
1012*14b24e2bSVaishali Kulkarni		\texttt{rqe\_log\_size} & 8 for 256B RQE \\ \hline
1013*14b24e2bSVaishali Kulkarni		\texttt{rq\_num\_entries} & Number of RQ entries, suggested value for Initiator 16 (4KB RQ), for Target 128 \\ \hline
1014*14b24e2bSVaishali Kulkarni		\texttt{cq\_num\_entries} & \texttt{num\_tasks} + \texttt{rq\_num\_entries} \\ \hline
1015*14b24e2bSVaishali Kulkarni		\texttt{cmdq\_num\_entries} & Number of CmdQ entries, suggested default value \texttt{num\_tasks} \\ \hline
1016*14b24e2bSVaishali Kulkarni		\texttt{max\_cwnd} & Max congestion window, suggested default value 0xFFFFFFFF (no limit) \\ \hline
1017*14b24e2bSVaishali Kulkarni		\texttt{dup\_ack\_threshold} & Dup-ACK counter, suggested default value 3 \\ \hline
1018*14b24e2bSVaishali Kulkarni		\texttt{max\_fin\_rt} & Number of FIN retransmits before abortive close, suggested default value 3 \\ \hline
1019*14b24e2bSVaishali Kulkarni		\texttt{gl\_rq\_pi} & The index in the status-block for CQ completions, suggested value 0 \\ \hline
1020*14b24e2bSVaishali Kulkarni		\texttt{gl\_cmd\_pi} & The index in the status-block for CmdQ completions, suggested value 1 \\ \hline
1021*14b24e2bSVaishali Kulkarni		\end{tabular}
1022*14b24e2bSVaishali Kulkarni	\end{center}
1023*14b24e2bSVaishali Kulkarni	\item After the basic initialization process is completed successfully, it is possible to establish the LL2 queue, and send / receive LL2 packets (as described in section \ref{cha:ll2}).
1024*14b24e2bSVaishali Kulkarni	\item \myfunc{sp\_iscsi\_func\_start}{sp_iscsi_func_start} -- this function initializes the iSCSI PF, and passes PF-global parameters to FW. This function should be called before offloading any iSCSI connection.
1025*14b24e2bSVaishali Kulkarni	\item \myfunc{iscsi\_get\_global\_cmdq\_cons}{iscsi_get_global_cmdq_cons} -- this function returns the address in the device for updating RQ CONS for the specified queue.
1026*14b24e2bSVaishali Kulkarni\end{itemize}
1027*14b24e2bSVaishali Kulkarni
1028*14b24e2bSVaishali Kulkarni\section{Establish iSCSI connection}
1029*14b24e2bSVaishali Kulkarni\silentfunc{iscsi_acquire_connection}
1030*14b24e2bSVaishali Kulkarni\silentfunc{iscsi_offload_connection}
1031*14b24e2bSVaishali Kulkarni\silentfunc{iscsi_get_db_addr}
1032*14b24e2bSVaishali Kulkarni\silentfunc{iscsi_update_connection}
1033*14b24e2bSVaishali Kulkarni\begin{itemize}
1034*14b24e2bSVaishali Kulkarni	\item \myfunc{iscsi\_acquire\_connection}{iscsi_acquire_connection} -- this function allocates the resources for the connection. \texttt{p\_in\_conn} which is passed to this function should be NULL. Note that ecore allocates by itself struct \texttt{ecore\_iscsi\_conn}, and returns its pointer to the upper driver via \texttt{p\_out\_conn}. Amongst others, ecore initializes in this struct the \texttt{icid} to be used in later task initialization, and the \texttt{conn\_id} which is a zero-based index.
1035*14b24e2bSVaishali Kulkarni	\item \myfunc{iscsi\_offload\_connection}{iscsi_offload_connection} -- this function offloads the connection to the device, and requests to establish the TCP connection. Before calling this function, the upper driver should determine the connection TCP parameters, allocate the connection SQ, and fill parameters in \texttt{ecore\_iscsi\_conn} struct. \\
1036*14b24e2bSVaishali Kulkarni	The following table describes the parameters that should be filled:
1037*14b24e2bSVaishali Kulkarni	\begin{center}
1038*14b24e2bSVaishali Kulkarni		\begin{tabular}{| l | p{10cm} |}
1039*14b24e2bSVaishali Kulkarni		\hline
1040*14b24e2bSVaishali Kulkarni		\textbf{Parameter} & \textbf{Description} \\ \hline
1041*14b24e2bSVaishali Kulkarni		\texttt{tcp\_on\_chip\_1b} & 1 \\ \hline
1042*14b24e2bSVaishali Kulkarni		\texttt{sq\_pbl\_addr} & The physical address of the array of pointers to the physical addresses of the SQ pages \\ \hline
1043*14b24e2bSVaishali Kulkarni		\texttt{local\_mac} & Local MAC address \\ \hline
1044*14b24e2bSVaishali Kulkarni		\texttt{remote\_mac} & Remote MAC address \\ \hline
1045*14b24e2bSVaishali Kulkarni		\texttt{vlan\_id} & VLAN ID \\ \hline
1046*14b24e2bSVaishali Kulkarni		\texttt{flags} & TS\_EN (timestamp enable) -- suggested default value 1 \newline DA\_EN (delayed-ACK enable) -- suggested default value 1 \newline DA\_CNT\_EN (delayed-ACK counter enable) -- suggested default value 1 \newline SACK\_EN (SACK enable) -- NA (not supported for iSCSI) \newline KA\_EN (keep-alive enable) -- suggested default value 1 \newline NAGLE\_EN (nagle enable) -- NA (not supported for iSCSI) \newline FIN\_SENT -- should be 0 \newline FIN\_RECEIVED -- should be 0 \\ \hline
1047*14b24e2bSVaishali Kulkarni		\texttt{ip\_version} & IP version \\ \hline
1048*14b24e2bSVaishali Kulkarni		\texttt{remote\_ip} & Remote IP address \\ \hline
1049*14b24e2bSVaishali Kulkarni		\texttt{local\_ip} & Local IP address \\ \hline
1050*14b24e2bSVaishali Kulkarni		\texttt{ka\_max\_probe\_cnt} & Number of keep-alive probe retransmits before indicating connection error, suggested default value 10 \\ \hline
1051*14b24e2bSVaishali Kulkarni		\texttt{dup\_ack\_theshold} & Dup-ACK counter, suggested default value 3 \\ \hline
1052*14b24e2bSVaishali Kulkarni		\texttt{rcv\_next} & In passive-open, SYN sequence number + 1. NA in active open \\ \hline
1053*14b24e2bSVaishali Kulkarni		\texttt{rcv\_wnd} & The window to advertise to the peer (before the scaling) \\ \hline
1054*14b24e2bSVaishali Kulkarni		\texttt{snd\_wl1} & In passive-open, SYN sequence number. NA in active open \\ \hline
1055*14b24e2bSVaishali Kulkarni		\texttt{cwnd} & Initial congestion window, suggested default value MSS \\ \hline
1056*14b24e2bSVaishali Kulkarni		\texttt{ss\_thresh} & Slow-start threshold, suggested default value 65535 \\ \hline
1057*14b24e2bSVaishali Kulkarni		\texttt{srtt} & Smoothed round-trip time, suggested default value 300 (300msec) \\ \hline
1058*14b24e2bSVaishali Kulkarni		\texttt{rtt\_var} & Round-trip time variation, suggested default value 150 (150msec) \\ \hline
1059*14b24e2bSVaishali Kulkarni		\texttt{ts\_recent} & In passive-open, the timestamp value in the SYN packet. NA in active open \\ \hline
1060*14b24e2bSVaishali Kulkarni		\texttt{flow\_label} & Flow label for IPv6, NA for IPv4 \\ \hline
1061*14b24e2bSVaishali Kulkarni		\texttt{ka\_timeout} & Timeout before the next KA after receiving ACK, suggested default value 7200000 (2 hours) \\ \hline
1062*14b24e2bSVaishali Kulkarni		\texttt{ka\_interval} & Timeout before the next KA after sending KA probe, suggested default value 10000 (10 sec) \\ \hline
1063*14b24e2bSVaishali Kulkarni		\texttt{max\_rt\_time} & Maximum retransmit time before indicating connection error, suggested default value 20sec \\ \hline
1064*14b24e2bSVaishali Kulkarni		\texttt{ttl} & Time-to-live for IPv4, hop-limit for IPv6 \\ \hline
1065*14b24e2bSVaishali Kulkarni		\texttt{tos\_or\_tc} & Type-of-service for IPv4, traffic-class for IPv6 \\ \hline
1066*14b24e2bSVaishali Kulkarni		\texttt{remote\_port} & Remote TCP port \\ \hline
1067*14b24e2bSVaishali Kulkarni		\texttt{local\_port} & Local TCP port \\ \hline
1068*14b24e2bSVaishali Kulkarni		\texttt{mss} & Maximum segment size \\ \hline
1069*14b24e2bSVaishali Kulkarni		\texttt{snd\_wnd\_scale} & In passive-open, taken from the TS-scale option in the received SYN packet, NA for active open \\ \hline
1070*14b24e2bSVaishali Kulkarni		\texttt{rcv\_wnd\_scale} & Receive window scale, suggested default value 4 \\ \hline
1071*14b24e2bSVaishali Kulkarni		\texttt{ts\_ticks\_per\_second} & Time-stamp resolution, suggested default value 1000 (1msec) \\ \hline
1072*14b24e2bSVaishali Kulkarni		\texttt{da\_timeout\_value} & Delayed-ACK timeout, suggested default value 200 (msec) \\ \hline
1073*14b24e2bSVaishali Kulkarni		\texttt{ack\_frequency} & Delayed-ACK counter, suggested default value 2 \\ \hline
1074*14b24e2bSVaishali Kulkarni		\texttt{default\_cq} & The desired queue number for completing un-solicited packets / commands \\ \hline
1075*14b24e2bSVaishali Kulkarni		\end{tabular}
1076*14b24e2bSVaishali Kulkarni	\end{center}
1077*14b24e2bSVaishali Kulkarni	When this call completes, the connection is offloaded and the 3-way handshake has started. 3-way handshake completion is indicated by an asynchronous call from ecore.
1078*14b24e2bSVaishali Kulkarni	After this call completes (and even before the asynchronous call), the driver can post a Login PDU to the SQ. However, FW will process the SQ only after the 3-way handshake is completed.
1079*14b24e2bSVaishali Kulkarni	\item \myfunc{iscsi\_get\_db\_addr}{iscsi_get_db_addr} -- this function returns the address in the device for updating SQ PROD for the specified CID.
1080*14b24e2bSVaishali Kulkarni	\item \myfunc{iscsi\_update\_connection}{iscsi_update_connection} -- this function sends updated iSCSI connection parameters to the device, after Login negotiation ended successfully. Before calling this function, the upper driver should fill parameters in \texttt{ecore\_iscsi\_conn} struct. The following table describes the parameters that should be filled:
1081*14b24e2bSVaishali Kulkarni	\begin{center}
1082*14b24e2bSVaishali Kulkarni		\begin{tabular}{| l | p{10cm} |}
1083*14b24e2bSVaishali Kulkarni		\hline
1084*14b24e2bSVaishali Kulkarni		\textbf{Parameter} & \textbf{Description} \\ \hline
1085*14b24e2bSVaishali Kulkarni		\texttt{update\_flag} & The negotiated values for HeaderDigest, DataDigest, InitialR2T and ImmediateData \\ \hline
1086*14b24e2bSVaishali Kulkarni		\texttt{max\_seq\_size} & The negotiated value for MaxBurstLength \\ \hline
1087*14b24e2bSVaishali Kulkarni		\texttt{max\_pdu\_size} & The negotiated value for MaxRecvDataSegmentLength \\ \hline
1088*14b24e2bSVaishali Kulkarni		\texttt{first\_seq\_length} & The negotiated value for FirstBurstLength \\ \hline
1089*14b24e2bSVaishali Kulkarni		\texttt{exp\_stat\_sn} & For Initiator, StatSN from the Login response + 1. NA for Target \\ \hline
1090*14b24e2bSVaishali Kulkarni		\end{tabular}
1091*14b24e2bSVaishali Kulkarni	\end{center}
1092*14b24e2bSVaishali Kulkarni\end{itemize}
1093*14b24e2bSVaishali Kulkarni
1094*14b24e2bSVaishali Kulkarni\section{Close iSCSI connection}
1095*14b24e2bSVaishali Kulkarni\silentfunc{iscsi_terminate_connection}
1096*14b24e2bSVaishali Kulkarni\silentfunc{iscsi_release_connection}
1097*14b24e2bSVaishali Kulkarni\begin{itemize}
1098*14b24e2bSVaishali Kulkarni	\item \myfunc{iscsi\_terminate\_connection}{iscsi_terminate_connection} -- this function removes the connection from the device, and requests to close the TCP connection. When this call completes, the connection closure state machine has started, but the connection is still offloaded. Connection closure and removal from the device is indicated by an asynchronous call from ecore.
1099*14b24e2bSVaishali Kulkarni	Before calling this function, driver needs to clean all outstanding tasks on the connection by sending cleanup requests via SQ. Clear-SQ / drain may be needed in exceptional cases. \\
1100*14b24e2bSVaishali Kulkarni	Logout PDU, if desired, should be posted to SQ before calling this function.
1101*14b24e2bSVaishali Kulkarni	\item \myfunc{iscsi\_release\_connection}{iscsi_release_connection} -- this function releases the resources for the connection. It should be called only after the asynchronous call from ecore on connection termination is received.
1102*14b24e2bSVaishali Kulkarni\end{itemize}
1103*14b24e2bSVaishali Kulkarni
1104*14b24e2bSVaishali Kulkarni\section{Close iSCSI PF}
1105*14b24e2bSVaishali Kulkarni\silentfunc{sp_iscsi_func_stop}
1106*14b24e2bSVaishali Kulkarni\begin{itemize}
1107*14b24e2bSVaishali Kulkarni	\item \myfunc{sp\_iscsi\_func\_stop}{sp_iscsi_func_stop} -- this function closes the iSCSI PF. This function should be called only after all the connections on the PF were closed.
1108*14b24e2bSVaishali Kulkarni	\item If an LL2 queue was established, it should be closed before continuing with the de-initialization process.
1109*14b24e2bSVaishali Kulkarni	\item The rest of the de-initialization process is described in section \ref{sec:init-de-init} for all protocols.
1110*14b24e2bSVaishali Kulkarni\end{itemize}
1111*14b24e2bSVaishali Kulkarni
1112*14b24e2bSVaishali Kulkarni\section{Getting statistics}
1113*14b24e2bSVaishali Kulkarni\myfunc{iscsi\_get\_stats}{iscsi_get_stats} can be used to query the device for various protocol-related statistics.
1114*14b24e2bSVaishali Kulkarni
1115*14b24e2bSVaishali Kulkarni\SpillChapterFuncs
1116*14b24e2bSVaishali Kulkarni
1117*14b24e2bSVaishali Kulkarni\chapterimage{qlogic-full-36}
1118*14b24e2bSVaishali Kulkarni\chapter{FCoE protocol}
1119*14b24e2bSVaishali Kulkarni\label{cha:fcoe}
1120*14b24e2bSVaishali Kulkarni
1121*14b24e2bSVaishali KulkarniThis chapter describes the ecore interface for the upper-layer driver of the FCoE protocol.
1122*14b24e2bSVaishali Kulkarni
1123*14b24e2bSVaishali Kulkarni\section{Start FCoE PF}
1124*14b24e2bSVaishali Kulkarni\silentfunc{sp_fcoe_func_start}
1125*14b24e2bSVaishali Kulkarni\silentfunc{fcoe_get_global_cmdq_cons}
1126*14b24e2bSVaishali Kulkarni\begin{itemize}
1127*14b24e2bSVaishali Kulkarni	\item The initialization process is described in section \ref{sec:init-init} for all protocols. \\
1128*14b24e2bSVaishali Kulkarni	Specifically for FCoE, before calling \texttt{ecore\_resc\_alloc()}, the upper driver should determine the PF-global parameters, allocate all PF-global queues, and fill the \texttt{fcoe\_pf\_params} part in struct \texttt{ecore\_pf\_params}. \\
1129*14b24e2bSVaishali Kulkarni	The following table describes the parameters that should be filled (the rest should be zero):
1130*14b24e2bSVaishali Kulkarni	\begin{center}
1131*14b24e2bSVaishali Kulkarni		\begin{tabular}{| l | p{10cm} |}
1132*14b24e2bSVaishali Kulkarni		\hline
1133*14b24e2bSVaishali Kulkarni		\textbf{Parameter} & \textbf{Description} \\ \hline
1134*14b24e2bSVaishali Kulkarni		\texttt{num\_cons} & Up to 4K are supported, suggested default value 128 \\ \hline
1135*14b24e2bSVaishali Kulkarni		\texttt{num\_tasks} & Up to 4K are supported, suggested default value 1K \\ \hline
1136*14b24e2bSVaishali Kulkarni		\texttt{glbl\_q\_params\_addr} & The physical address of the list of pointers to the arrays of pointers to global queues pages. \newline The list is built as follows: CQ\#0 PBL pointer, RQ\#0 PBL pointer, CmdQ\#0 PBL pointer, CQ\#1 PBL pointer, RQ\#1 PBL pointer, CmdQ\#1 PBL pointer, etc. \newline Each PBL pointer points to the physical address which contains an array of pointers to the physical addresses of the specific queue pages. \\ \hline
1137*14b24e2bSVaishali Kulkarni		\texttt{sq\_num\_pbl\_pages} & Number of outstanding tasks on the connection * 8B / page-size. \newline Suggested default value for number of outstanding tasks on the connection 256 \\ \hline
1138*14b24e2bSVaishali Kulkarni		\texttt{rq\_num\_entries} & Number of RQ entries, suggested value for Initiator 16 (4KB RQ), for Target 128 \\ \hline
1139*14b24e2bSVaishali Kulkarni		\texttt{cq\_num\_entries} & \texttt{num\_tasks} + \texttt{rq\_num\_entries} \\ \hline
1140*14b24e2bSVaishali Kulkarni		\texttt{cmdq\_num\_entries} & Number of CmdQ entries, suggested value \texttt{num\_tasks} \\ \hline
1141*14b24e2bSVaishali Kulkarni		\texttt{rq\_buffer\_log\_size} & 8 for 256B RQE \\ \hline
1142*14b24e2bSVaishali Kulkarni		\texttt{num\_cqs} & Number of global queues (CQ / CmdQ / RQ). This should be $\leq$ number of available MSIX vectors for the PF \\ \hline
1143*14b24e2bSVaishali Kulkarni		\texttt{log\_page\_size} & 12 for 4KB pages \\ \hline
1144*14b24e2bSVaishali Kulkarni		\texttt{mtu} & Ethernet maximum transmission unit \\ \hline
1145*14b24e2bSVaishali Kulkarni		\texttt{gl\_rq\_pi} & The index in the status-block for CQ completions, suggested value 0 \\ \hline
1146*14b24e2bSVaishali Kulkarni		\texttt{gl\_cmd\_pi} & The index in the status-block for CmdQ completions, suggested value 1 \\ \hline
1147*14b24e2bSVaishali Kulkarni		\end{tabular}
1148*14b24e2bSVaishali Kulkarni	\end{center}
1149*14b24e2bSVaishali Kulkarni	\item After the basic initialization process is completed successfully, it is possible to establish the LL2 queue, and send / receive LL2 packets.
1150*14b24e2bSVaishali Kulkarni	\item \myfunc{sp\_fcoe\_func\_start}{sp_fcoe_func_start} -- this function initializes the FCoE PF, and passes PF-global parameters to FW. This function should be called before offloading any FCoE connection.
1151*14b24e2bSVaishali Kulkarni	\item \myfunc{fcoe\_get\_global\_cmdq\_cons}{fcoe_get_global_cmdq_cons} -- this function returns the address in the device for updating RQ CONS for the specified queue.
1152*14b24e2bSVaishali Kulkarni\end{itemize}
1153*14b24e2bSVaishali Kulkarni
1154*14b24e2bSVaishali Kulkarni\section{Establish FCoE connection}
1155*14b24e2bSVaishali Kulkarni\silentfunc{fcoe_acquire_connection}
1156*14b24e2bSVaishali Kulkarni\silentfunc{fcoe_offload_connection}
1157*14b24e2bSVaishali Kulkarni\silentfunc{fcoe_get_db_addr}
1158*14b24e2bSVaishali Kulkarni\begin{itemize}
1159*14b24e2bSVaishali Kulkarni	\item \myfunc{fcoe\_acquire\_connection}{fcoe_acquire_connection} -- this function allocates the resources for the connection. \texttt{p\_in\_conn} which is passed to this function should be NULL. Note that ecore allocates by itself struct \texttt{ecore\_fcoe\_conn}, and returns its pointer to the upper driver via \texttt{p\_out\_conn}. Amongst others, ecore initializes in this struct the \texttt{icid} to be used in later task initialization, and the \texttt{conn\_id} which is a zero-based index.
1160*14b24e2bSVaishali Kulkarni	\item \myfunc{fcoe\_offload\_connection}{fcoe_offload_connection} -- this function offloads the connection to the device. Before calling this function, the upper driver should allocate the connection SQ, and fill parameters in \texttt{ecore\_fcoe\_conn} struct. \\
1161*14b24e2bSVaishali Kulkarni	The following table describes the parameters that should be filled:
1162*14b24e2bSVaishali Kulkarni	\begin{center}
1163*14b24e2bSVaishali Kulkarni		\begin{tabular}{| l | p{10cm} |}
1164*14b24e2bSVaishali Kulkarni		\hline
1165*14b24e2bSVaishali Kulkarni		\textbf{Parameter} & \textbf{Description} \\ \hline
1166*14b24e2bSVaishali Kulkarni		\texttt{sq\_pbl\_addr} & The physical address of the array of pointers to the physical addresses of the SQ pages \\ \hline
1167*14b24e2bSVaishali Kulkarni		\texttt{sq\_curr\_page\_addr} & The physical address of the first SQ page \\ \hline
1168*14b24e2bSVaishali Kulkarni		\texttt{sq\_next\_page\_addr} & The physical address of the second SQ page (or the first in case of a single page SQ) \\ \hline
1169*14b24e2bSVaishali Kulkarni		\texttt{dst\_mac\_addr\_lo} & Remote MAC address -- 2 LSB bytes \\ \hline
1170*14b24e2bSVaishali Kulkarni		\texttt{dst\_mac\_addr\_mid} & Remote MAC address -- 2 middle bytes \\ \hline
1171*14b24e2bSVaishali Kulkarni		\texttt{dst\_mac\_addr\_hi} & Remote MAC address -- 2 MSB bytes \\ \hline
1172*14b24e2bSVaishali Kulkarni		\texttt{src\_mac\_addr\_lo} & Local MAC address -- 2 LSB bytes \\ \hline
1173*14b24e2bSVaishali Kulkarni		\texttt{src\_mac\_addr\_mid} & Local MAC address -- 2 middle bytes \\ \hline
1174*14b24e2bSVaishali Kulkarni		\texttt{src\_mac\_addr\_hi} & Local MAC address -- 2 MSB bytes \\ \hline
1175*14b24e2bSVaishali Kulkarni		\texttt{tx\_max\_fc\_pay\_len} & The maximum FC payload size to transmit \\ \hline
1176*14b24e2bSVaishali Kulkarni		\texttt{e\_d\_tov\_timer\_val} & E\_D\_TOV timeout value in 1msec resolution \\ \hline
1177*14b24e2bSVaishali Kulkarni		\texttt{rec\_tov\_timer\_val} & REC\_TOV timeout value in 1msec resolution \\ \hline
1178*14b24e2bSVaishali Kulkarni		\texttt{rx\_max\_fc\_pay\_len} & The maximum FC payload size to receive \\ \hline
1179*14b24e2bSVaishali Kulkarni		\texttt{vlan\_tag} & VLAN ID \\ \hline
1180*14b24e2bSVaishali Kulkarni		\texttt{s\_id} & FC Source ID \\ \hline
1181*14b24e2bSVaishali Kulkarni		\texttt{max\_conc\_seqs\_c3} & Max concurrent sequences \\ \hline
1182*14b24e2bSVaishali Kulkarni		\texttt{d\_id} & FC destination ID \\ \hline
1183*14b24e2bSVaishali Kulkarni		\texttt{flags} & INCR\_SEQ\_CNT (continuously increasing SEQ\_CNT on receive) \newline CONF\_REQ (confirmation request supported) \newline REC\_VALID (REC timeout supported) \newline VLAN\_FLAG (indicates if the VLAN ID is valid) \\ \hline
1184*14b24e2bSVaishali Kulkarni		\texttt{def\_q\_idx} & The desired queue number for completing un-solicited packets / commands \\ \hline
1185*14b24e2bSVaishali Kulkarni		\end{tabular}
1186*14b24e2bSVaishali Kulkarni	\end{center}
1187*14b24e2bSVaishali Kulkarni	After this call completes, the driver can post tasks to the SQ.
1188*14b24e2bSVaishali Kulkarni	\item \myfunc{fcoe\_get\_db\_addr}{fcoe_get_db_addr} -- this function returns the address in the device for updating SQ PROD for the specified CID.
1189*14b24e2bSVaishali Kulkarni\end{itemize}
1190*14b24e2bSVaishali Kulkarni
1191*14b24e2bSVaishali Kulkarni\section{Close FCoE connection}
1192*14b24e2bSVaishali Kulkarni\silentfunc{fcoe_terminate_connection}
1193*14b24e2bSVaishali Kulkarni\silentfunc{fcoe_release_connection}
1194*14b24e2bSVaishali Kulkarni\begin{itemize}
1195*14b24e2bSVaishali Kulkarni	\item \myfunc{fcoe\_terminate\_connection}{fcoe_terminate_connection} -- this function removes the connection from the device. \\
1196*14b24e2bSVaishali Kulkarni	Before calling this function, driver needs to clean all outstanding tasks on the connection by sending cleanup requests via SQ. Drain may be needed in exceptional cases.
1197*14b24e2bSVaishali Kulkarni	\item \myfunc{fcoe\_release\_connection}{fcoe_release_connection} -- this function releases the resources for the connection.
1198*14b24e2bSVaishali Kulkarni\end{itemize}
1199*14b24e2bSVaishali Kulkarni
1200*14b24e2bSVaishali Kulkarni\section{Close FCoE PF}
1201*14b24e2bSVaishali Kulkarni\silentfunc{sp_fcoe_func_stop}
1202*14b24e2bSVaishali Kulkarni\begin{itemize}
1203*14b24e2bSVaishali Kulkarni	\item \myfunc{sp\_fcoe\_func\_stop}{sp_fcoe_func_stop} -- this function closes the FCoE PF. This function should be called only after all the connections on the PF were closed.
1204*14b24e2bSVaishali Kulkarni	\item If an LL2 queue was established, it should be closed before continuing with the de-initialization process.
1205*14b24e2bSVaishali Kulkarni	\item The rest of the de-initialization process is described in section \ref{sec:init-de-init} for all protocols.
1206*14b24e2bSVaishali Kulkarni\end{itemize}
1207*14b24e2bSVaishali Kulkarni
1208*14b24e2bSVaishali Kulkarni\section{Getting statistics}
1209*14b24e2bSVaishali Kulkarni\myfunc{fcoe\_get\_stats}{fcoe_get_stats} can be used to query the device for various protocol-related statistics.
1210*14b24e2bSVaishali Kulkarni
1211*14b24e2bSVaishali Kulkarni\SpillChapterFuncs
1212*14b24e2bSVaishali Kulkarni
1213*14b24e2bSVaishali Kulkarni\chapterimage{qlogic-full-36}
1214*14b24e2bSVaishali Kulkarni\chapter{RDMA protocol}
1215*14b24e2bSVaishali Kulkarni\label{cha:rdma}
1216*14b24e2bSVaishali Kulkarni
1217*14b24e2bSVaishali KulkarniThis chapter describes the ecore interface for the upper-layer driver of the RDMA protocol. The interface aims at sharing as much as possible between RoCE and iWARP. This chapter is not complete, and currently only details changes for iWARP (except for DCQCN, which was already detailed before).
1218*14b24e2bSVaishali KulkarniFor iWARP support, modifications to existing structure and function names will be made to ease the distinction between the two, similar to the HSI changes. The following convention will be used:
1219*14b24e2bSVaishali Kulkarni\begin{itemize}
1220*14b24e2bSVaishali Kulkarni\item ecore\_rdma\_xxx will be used for common structures and functions
1221*14b24e2bSVaishali Kulkarni\item ecore\_roce\_xxx will be used for roce specific structures, fields and functions
1222*14b24e2bSVaishali Kulkarni\item ecore\_iwarp\_xxx will be used for iwarp specific structures, fields and functions
1223*14b24e2bSVaishali Kulkarni\end{itemize}
1224*14b24e2bSVaishali Kulkarni
1225*14b24e2bSVaishali Kulkarni\section{Distinguish between iWARP and RoCE}
1226*14b24e2bSVaishali KulkarniEcore per function context has a field ``personality'' which is set based on the protocol being iWARP/FCoE/iSCSI/RoCE/Ethernet. In the context of a network driver, the ecore personality could be ECORE\_PCI\_ETH\_ROCE, ECORE\_PCI\_IWARP, or ECORE\_PCI\_ETH (enum ecore\_pci\_personality).
1227*14b24e2bSVaishali KulkarniEcore provides the driver the ability to set the ecore personality through the call to ecore\_hw\_prepare by passing personality as a parameter. If the `personality' passed in the call to ecore\_hw\_prepare is ECORE\_PCI\_DEFAULT, the `personality' is derived from the NVRAM configuration for protocol and device capability; otherwise the setting passed by the upper driver in the call overrides the NVRAM configuration.
1228*14b24e2bSVaishali KulkarniTBD: NVRAM configuration for distinguishing iWARP and RoCE does not exist and is not finalized yet.
1229*14b24e2bSVaishali Kulkarni
1230*14b24e2bSVaishali Kulkarni
1231*14b24e2bSVaishali Kulkarni\section{Init RDMA PF}
1232*14b24e2bSVaishali Kulkarni\silentfunc{rdma_start}
1233*14b24e2bSVaishali Kulkarni\silentfunc{rdma_add_user}
1234*14b24e2bSVaishali Kulkarni\silentfunc{rdma_query_device}
1235*14b24e2bSVaishali Kulkarni\silentfunc{rdma_get_sb_id}
1236*14b24e2bSVaishali Kulkarni\begin{itemize}
1237*14b24e2bSVaishali Kulkarni\item The initialization process is described in section \ref{sec:init-init} for all protocols. \\
1238*14b24e2bSVaishali Kulkarni	Specifically for RDMA, before calling \texttt{ecore\_resc\_alloc()}, the upper driver should determine the PF-global parameters, allocate all PF-global queues, and fill the \texttt{rdma\_pf\_params} part in struct \texttt{ecore\_pf\_params}. \\
1239*14b24e2bSVaishali Kulkarni	The following table describes the parameters that should be filled (the rest should be zero):
1240*14b24e2bSVaishali Kulkarni	\begin{center}
1241*14b24e2bSVaishali Kulkarni		\begin{tabular}{| l | p{10cm} |}
1242*14b24e2bSVaishali Kulkarni		\hline
1243*14b24e2bSVaishali Kulkarni		\textbf{Parameter} & \textbf{Description} \\ \hline
1244*14b24e2bSVaishali Kulkarni		\texttt{min\_dpis} & the minimum number of device contexts required (i.e. the number of times open device can be called simultaneously) \\ \hline
1245*14b24e2bSVaishali Kulkarni		\texttt{num\_qps} & requested number of queue pairs\\ \hline
1246*14b24e2bSVaishali Kulkarni		\texttt{edpm\_mode} & (0-Enable EDPM if BAR size is adequate, 1-Force EDPM (modprobe may fail on small BARs), 2-Disable EDPM) This field is relevant to RoCE only\\ \hline
1247*14b24e2bSVaishali Kulkarni		\texttt{num\_mrs} & Number of supported MRs. Up to 4K are supported, suggested default value 1K \\ \hline
1248*14b24e2bSVaishali Kulkarni		\texttt{gl\_pi} & The index in the status-block for CNQ completions, suggested value 3 (define named QED\_RDMA\_PROTOCOL\_INDEX exists)\\ \hline
1249*14b24e2bSVaishali Kulkarni		\texttt{roce\_enable\_dcqcn} & If enabled, the maximum number of rate limiters will be allocated during hardware initialization; these can later be initialized and configured during roce start. Must be set in order to enable dcqcn during roce initialization. This field is relevant to RoCE only.\\ \hline
1250*14b24e2bSVaishali Kulkarni		\texttt{iwarp\_port} & TCP port number to be used for the iwarp traffic\\ \hline
1251*14b24e2bSVaishali Kulkarni		\end{tabular}
1252*14b24e2bSVaishali Kulkarni	\end{center}
1253*14b24e2bSVaishali Kulkarni	The values of num\_qps, num\_mrs will impact the amount of memory allocated in the ILT. Note that although these parameters are rdma specific, they are actually used during common hw initialization phase. The amount of ilt memory will differ between RoCE and iWARP as iWARP requires only one cid per QP and RoCE requires two.
1254*14b24e2bSVaishali Kulkarni
1255*14b24e2bSVaishali Kulkarni	\item \myfunc{rdma\_start}{rdma_start} -- this function initializes the RDMA PF, allocates resources required for RDMA and passes PF-global parameters to FW. This function should be called before performing any other RDMA operations.
1256*14b24e2bSVaishali Kulkarni The following table describes the parameters that should be passed to the function:
1257*14b24e2bSVaishali Kulkarni	\begin{center}
1258*14b24e2bSVaishali Kulkarni		\begin{tabular}{| l | p{10cm} |}
1259*14b24e2bSVaishali Kulkarni		\hline
1260*14b24e2bSVaishali Kulkarni		\textbf{Parameter} & \textbf{Description} \\ \hline
1261*14b24e2bSVaishali Kulkarni		\texttt{events} & RoCE - callback functions for affiliated and unaffiliated events.\\ \hline
1262*14b24e2bSVaishali Kulkarni		\texttt{desired\_cnq} & desired number of cnqs to be used. Upper layer driver needs to make sure enough resources are available for this number (number of msix vectors and cnq resources).\\ \hline
1263*14b24e2bSVaishali Kulkarni		\texttt{cnq\_pbl\_list} & Array of pbls used per cnq. The array should be initialized according to the value set in desired\_cnq\\ \hline
1264*14b24e2bSVaishali Kulkarni		\texttt{cq\_mode} & The CQ Mode affects the CQ doorbell transaction size. 64 or 32 bit machines should configure to 32 or 16 bits respectively.\\ \hline
1265*14b24e2bSVaishali Kulkarni		\texttt{roce\_dcqcn\_params} & relevant only if enable\_dcqcn was initialized to true in rdma\_pf\_params. Upper level driver needs to set appropriate fields. See dcqcn section below.\\ \hline
1266*14b24e2bSVaishali Kulkarni		\texttt{max\_mtu} & Required for iWARP ll2. Can give ethernet mtu.\\ \hline
1267*14b24e2bSVaishali Kulkarni		\texttt{mac\_addr} & Required for iWARP ll2. Should be primary mac used for RDMA.\\ \hline
1268*14b24e2bSVaishali Kulkarni		\texttt{iwarp\_flags} & TCP related flags that can be controlled by user.
1269*14b24e2bSVaishali Kulkarni		\begin{tabbing}
1270*14b24e2bSVaishali Kulkarni			TS\_EN: Timestamp enabled \\
1271*14b24e2bSVaishali Kulkarni			DA\_EN: Delayed ack enabled \\
1272*14b24e2bSVaishali Kulkarni		\end{tabbing} \\ \hline
1273*14b24e2bSVaishali Kulkarni		\texttt{iwarp\_crc\_needed} & Control whether CRC should be used. \\ \hline
1274*14b24e2bSVaishali Kulkarni		\texttt{iwarp\_rcv\_wnd\_size} & ecore will calculate the receive window scale from this. This number should be provided in bytes. There is a minimum of 64K, any number below this will result in the default window size being set which is 1MB \\ \hline
1275*14b24e2bSVaishali Kulkarni		\end{tabular}
1276*14b24e2bSVaishali Kulkarni	\end{center}
1277*14b24e2bSVaishali Kulkarni	\item \myfunc{rdma\_get\_sb\_id}{rdma_get_sb_id} -- this function returns the first status block id assigned for RDMA. This is required for initializing the RDMA status blocks using the function \textit{ecore\_int\_sb\_init()}.
1278*14b24e2bSVaishali Kulkarni	\item \myfunc{rdma\_add\_user}{rdma_add_user} -- this function allocates a dpi index for the client. During initialization, this function should be called to allocate a reserved dpi index for the kernel.
1279*14b24e2bSVaishali Kulkarni	\item \myfunc{rdma\_query\_device}{rdma_query_device} -- this function returns a struct of type ecore\_rdma\_device which contains the capabilities and set options for the given device.
1280*14b24e2bSVaishali Kulkarni\subsection{DCQCN}
1281*14b24e2bSVaishali KulkarniDCQCN is only relevant for RoCE.
1282*14b24e2bSVaishali Kulkarni\item Enable\_dcqcn under rdma\_pf\_params allocates additional hardware resources (rate limiters ) which can later be used to enable DCQCN notification point and reaction point. This must be set prior to calling \texttt{ecore\_resc\_alloc()}.
1283*14b24e2bSVaishali Kulkarni\item Additional parameters were added to ecore\_rdma\_start parameters to configure dcqcn. These will only be valid if the roce\_enable\_dcqcn in rdma\_pf\_params was set.
1284*14b24e2bSVaishali KulkarniNotification point and reaction point can be enabled independently.
1285*14b24e2bSVaishali KulkarniWhen configuring the device to act as notification point, the ecore will initialize the NIG block accordingly and pass the priority vlan and cnp send timeout values to FW. When configuring the device to act as reaction point, the ecore will send a ramrod to FW that configures the rate limiters allocated for dcqcn support with the values received from the upper layer driver (such as maximum rate, byte counter limit, active increase rate, etc.; full details are in the ecore\_roce\_api.h file). At this point all rate limiters will be configured with the same values. If in the future there will be a need to configure different rate limiters with different values, an additional API function will be provided. During initialization, ecore will map between physical queues used for RoCE and rate limiters. The number of rate limiters allocated is handled by resource management and is currently divided equally between the functions. During modify\_qp, ecore will configure the responder and requester to work with a unique physical queue, which is configured to work with a unique rate limiter. QPs that are opened after the rate limiters are used up will be configured to run on a default physical queue which does not have a rate limiter. FW assumes that the qp\_id is equal to the physical queue id. For simplicity, the implementation assumes that Ethernet is not run simultaneously with RoCE (i.e. RoCE-only personality). If dcqcn is enabled and ethernet is run, ethernet will run on the same physical queue as the first qp that is allocated.
1286*14b24e2bSVaishali Kulkarni\end{itemize}
1287*14b24e2bSVaishali Kulkarni
1288*14b24e2bSVaishali Kulkarni\section{iWARP Connection Establishment}
1289*14b24e2bSVaishali KulkarniUnlike RoCE in which connection management is implemented completely in host, connection management for iWARP which involves the TCP 3 way handshake and MPA exchanges is implemented in  F/W. The host is nevertheless involved in offloading TCP and MPA and exchanging connection parameters as part of the connection establishment/teardown process.
1290*14b24e2bSVaishali Kulkarni\subsection{Ecore-upper driver connection establishment/teardown API for iWARP}
1291*14b24e2bSVaishali KulkarniDuring connection establishment/teardown, the driver calls ecore connection related APIs and receives callbacks from ecore for connection related events. The driver registers its event callbacks by passing them as parameters to the different connection ecore APIs.
1292*14b24e2bSVaishali Kulkarni
1293*14b24e2bSVaishali Kulkarni\subsection{Ecore APIs/functions for driver (downcalls)}
1294*14b24e2bSVaishali Kulkarni\begin{tabular}{| l | p{10cm} |}
1295*14b24e2bSVaishali Kulkarni	\hline
1296*14b24e2bSVaishali Kulkarni	\textbf{Ecore Functions} & \textbf{Description} \\ \hline
1297*14b24e2bSVaishali Kulkarni	\texttt{ecore\_iwarp\_connect} & Used during active connection establishment. Called to establish an iWARP connection with a peer. This is a non-blocking call. Once connection is established an async event will be sent to driver. \\ \hline
1298*14b24e2bSVaishali Kulkarni	\texttt{ecore\_iwarp\_create\_listen} & Used for passive connection. Called to start a listener. \\ \hline
1299*14b24e2bSVaishali Kulkarni	\texttt{ecore\_iwarp\_destroy\_listen} & Used for passive connection. Called to destroy a listener. \\ \hline
1300*14b24e2bSVaishali Kulkarni	\texttt{ecore\_iwarp\_accept} & Used during passive connection establishment. Called for accepting a previously received iWARP connection request event. i.e. MPA request event. Once connection is fully established an async event will be sent to driver. \\ \hline
1301*14b24e2bSVaishali Kulkarni
1302*14b24e2bSVaishali Kulkarni\end{tabular}
1303*14b24e2bSVaishali Kulkarni\subsection{Communication Management information}
1304*14b24e2bSVaishali Kulkarni\label{sec:cminfo}
1305*14b24e2bSVaishali KulkarniFor both passive and active connect, basic information on host and peer is required. We define a structure called \texttt{ecore\_iwarp\_cm\_info} which will be passed between driver and ecore on both downcalls and upcalls. Throughout the rest of the chapter we'll refer to this as the cm\_info.
1306*14b24e2bSVaishali Kulkarni\begin{tabular}{| l | p{10cm} |}
1307*14b24e2bSVaishali Kulkarni 	\hline
1308*14b24e2bSVaishali Kulkarni 	\textbf{Field} & \textbf{Description} \\ \hline
1309*14b24e2bSVaishali Kulkarni 	\texttt{ip\_version} & Type: enum ecore\_tcp\_ip\_version. Determines if ipv6 or ipv4 \\ \hline
1310*14b24e2bSVaishali Kulkarni 	\texttt{remote\_ip} & Ip address of remote side. In host byte order. \\ \hline
1311*14b24e2bSVaishali Kulkarni 	\texttt{local\_ip} & Ip address of local side. In host byte order. \\ \hline
1312*14b24e2bSVaishali Kulkarni 	\texttt{remote\_port} & Port of the remote side. In host byte order. \\ \hline
1313*14b24e2bSVaishali Kulkarni 	\texttt{local\_port} & Port of the local side. In host byte order. \\ \hline
1314*14b24e2bSVaishali Kulkarni 	\texttt{vlan} & vlan to be used. 0 value means no vlan. \\ \hline
1315*14b24e2bSVaishali Kulkarni 	\texttt{private\_data} & Used for MPA. This data will be sent on the MPA request. \\ \hline
1316*14b24e2bSVaishali Kulkarni 	\texttt{private\_data\_len} & Length of the private data. \\ \hline
1317*14b24e2bSVaishali Kulkarni 	\texttt{ord} & Negotiated in MPA Rev2. Used as is in MPA Rev1. \\ \hline
1318*14b24e2bSVaishali Kulkarni 	\texttt{ird} & Negotiated in MPA Rev2. Used as is in MPA Rev1. \\ \hline
1319*14b24e2bSVaishali Kulkarni\end{tabular}
1320*14b24e2bSVaishali KulkarniEcore sends the same set of parameters for all event types, though the parameters required for a given event type are only a subset of the complete set. It does this so that it can pass a pointer to a single data structure instead of reformatting the parameters into a different structure for each event.
1321*14b24e2bSVaishali Kulkarni
1322*14b24e2bSVaishali Kulkarni\subsection{Active side connection establishment}
1323*14b24e2bSVaishali KulkarniOn the active side of iWARP connection establishment, it is assumed that create\_qp will be called prior to calling connect. The QP created will be sent as a parameter to the connect function call.
1324*14b24e2bSVaishali Kulkarni\subsubsection{ecore\_iwarp\_connect}
1325*14b24e2bSVaishali Kulkarni This function will take care of initiating the TCP 3-way handshake and MPA negotiation. Once the MPA response is received the event EVENT\_ACTIVE\_COMPLETE will be issued to upper-layer driver. This function is asynchronous. The function will receive cm\_info (detailed in \ref{sec:cminfo} ), mss, local and remote mac address. The mac address will be acquired by upper-layer driver using OS ip routing functions (such as find\_route in linux). In addition, it will require a pointer to the associated QP and a pointer to a callback function and callback context which will be used to indicate events to the driver which are related to this connection. \newline
1326*14b24e2bSVaishali Kulkarni \begin{tabular}{| l | p{10cm} |}
1327*14b24e2bSVaishali Kulkarni 	\hline
1328*14b24e2bSVaishali Kulkarni 	\textbf{Return Values} & \textbf{Description} \\ \hline
1329*14b24e2bSVaishali Kulkarni 	\texttt{ECORE\_NO\_MEM} & Memory is required for driver context of a connection. If it can't allocate it will return this failure. \\ \hline
1330*14b24e2bSVaishali Kulkarni 	\texttt{ECORE\_SUCCESS} & Means tcp offload was performed. Does not mean connection was established. The status of connection establishment will be passed with the EVENT\_ACTIVE\_COMPLETE. \\ \hline
1331*14b24e2bSVaishali Kulkarni\end{tabular}
1332*14b24e2bSVaishali Kulkarni\subsubsection{event callbacks related}
1333*14b24e2bSVaishali KulkarniThe callback received in connect call will be called with the following values after MPA response was received from peer: \newline
1334*14b24e2bSVaishali Kulkarni \begin{tabular}{| l | p{10cm} |}
1335*14b24e2bSVaishali Kulkarni 	\hline
1336*14b24e2bSVaishali Kulkarni 	\textbf{Field} & \textbf{Value} \\ \hline
1337*14b24e2bSVaishali Kulkarni 	\texttt{event} & \texttt{ECORE\_IWARP\_EVENT\_ACTIVE\_COMPLETE} \\ \hline
1338*14b24e2bSVaishali Kulkarni 	\texttt{cm\_info} & \ref{sec:cminfo} with finalized values. MPA Rev2 for example will contain the negotiated ird/ord \\ \hline
1339*14b24e2bSVaishali Kulkarni 	\texttt{ep\_context} & Don't care for active side \\ \hline
1340*14b24e2bSVaishali Kulkarni 	\texttt{status} & ECORE\_SUCCESS if connection establishment was successful. ECORE\_TIMEOUT if connection timed out, ECORE\_CONNECTION\_REFUSED if mpa\_reject was received, ECORE\_CONN\_RESET if connection establishment failed for any other reason. \\ \hline
1341*14b24e2bSVaishali Kulkarni \end{tabular}
1342*14b24e2bSVaishali Kulkarni
1343*14b24e2bSVaishali Kulkarni\subsection{Passive side connection establishment}
1344*14b24e2bSVaishali KulkarniThe ecore will use the ll2 interface for implementing passive side connection establishment. Upper layer driver will send 2-tuples and vlan to ecore layer which the ecore should listen on. Once a SYN packet is received on the ll2 interface, the ecore will search its database to check if a listener was registered with the received 2-tuple and vlan. If such a listener is found, tcp offload ramrod will be sent and once the MPA request is received, the event EVENT\_MPA\_REQUEST will be issued to upper layer driver. At this stage it is expected that the upper layer driver will pass the MPA parameters such as private data, ord, ird all the way to the user app, which will in turn create a QP and related objects and later issue a call to ecore\_iwarp\_accept.
1345*14b24e2bSVaishali Kulkarni\subsubsection{iwarp\_create\_listen}
1346*14b24e2bSVaishali KulkarniThis function will receive socket local and remote addresses (port, ip and vlan) and add them to its listening database. In addition a callback function and callback context will be provided which will be used by ecore to send events of connection requests to the driver.
1347*14b24e2bSVaishali Kulkarni
1348*14b24e2bSVaishali Kulkarni\subsubsection{event callbacks related}
1349*14b24e2bSVaishali KulkarniThe callback received in listen call will be called with the following values after MPA request was received from network: \newline
1350*14b24e2bSVaishali Kulkarni\begin{tabular}{| l | p{10cm} |}
1351*14b24e2bSVaishali Kulkarni	\hline
1352*14b24e2bSVaishali Kulkarni	\textbf{Field} & \textbf{Value} \\ \hline
1353*14b24e2bSVaishali Kulkarni	\texttt{event} & \texttt{ECORE\_IWARP\_EVENT\_MPA\_REQUEST} \\ \hline
1354*14b24e2bSVaishali Kulkarni	\texttt{cm\_info} & \ref{sec:cminfo} with values received on the MPA request. \\ \hline
1355*14b24e2bSVaishali Kulkarni	\texttt{ep\_context} & Should be sent down to ecore during call to ecore\_iwarp\_accept. \\ \hline
1356*14b24e2bSVaishali Kulkarni	\texttt{status} & Don't care for this event. \\ \hline
1357*14b24e2bSVaishali Kulkarni\end{tabular}
1358*14b24e2bSVaishali Kulkarni
1359*14b24e2bSVaishali Kulkarni\subsubsection{ecore\_iwarp\_accept}
1360*14b24e2bSVaishali KulkarniThis function should be called when upper layer driver wants to accept a request issued by ecore's EVENT\_MPA\_REQUEST. The QP should have been created before calling this function. This function will send the MPA ramrod which will send an MPA response. Once the ACK on the MPA response is received the event EVENT\_PASSIVE\_COMPLETE will be sent to upper layer driver.
1361*14b24e2bSVaishali Kulkarni\begin{center}
1362*14b24e2bSVaishali Kulkarni		\begin{tabular}{| l | p{10cm} |}
1363*14b24e2bSVaishali Kulkarni		\hline
1364*14b24e2bSVaishali Kulkarni		\textbf{Param} & \textbf{Description}\\ \hline
1365*14b24e2bSVaishali Kulkarni		\texttt{ep\_context} & The ep\_context received in the MPA\_REQUEST event\\ \hline
1366*14b24e2bSVaishali Kulkarni		\texttt{cb\_context} & The same callback function passed in create\_listen will be used but with this new cb\_context.\\ \hline
1367*14b24e2bSVaishali Kulkarni		\texttt{private\_data} & attach to MPA frame.\\ \hline
1368*14b24e2bSVaishali Kulkarni		\texttt{private\_data\_len} & length of private data.\\ \hline
1369*14b24e2bSVaishali Kulkarni		\texttt{ord} & to send on MPA response. \\ \hline
1370*14b24e2bSVaishali Kulkarni		\texttt{ird} & to send on MPA response. \\ \hline
1371*14b24e2bSVaishali Kulkarni		\texttt{qp} & QP associated with this connection.\\ \hline
1372*14b24e2bSVaishali Kulkarni		\end{tabular}
1373*14b24e2bSVaishali Kulkarni\end{center}
1374*14b24e2bSVaishali Kulkarni
1375*14b24e2bSVaishali Kulkarni\subsubsection{ecore\_iwarp\_reject}
1376*14b24e2bSVaishali KulkarniThis function should be called when upper layer driver / App wants to reject, for whatever reason, a connection request issued by ecore's EVENT\_MPA\_REQUEST.
1377*14b24e2bSVaishali KulkarniIf a connection is rejected QP will not be associated with the connection request and remains an independent object ( if it was created ). Calling this function
1378*14b24e2bSVaishali Kulkarniwill result in an MPA response being sent to peer with the 'reject' flag being turned on. EVENT\_PASSIVE\_COMPLETE will be sent to upper layer driver with status
1379*14b24e2bSVaishali Kulkarnicode CONNECTION\_REFUSED.
1380*14b24e2bSVaishali Kulkarni\begin{center}
1381*14b24e2bSVaishali Kulkarni	\begin{tabular}{| l | p{10cm} |}
1382*14b24e2bSVaishali Kulkarni		\hline
1383*14b24e2bSVaishali Kulkarni		\textbf{Param} & \textbf{Description}\\ \hline
1384*14b24e2bSVaishali Kulkarni		\texttt{ep\_context} & The ep\_context received in the MPA\_REQUEST event\\ \hline
1385*14b24e2bSVaishali Kulkarni		\texttt{cb\_context} & The same callback function passed in create\_listen will be used but with this new cb\_context.\\ \hline
1386*14b24e2bSVaishali Kulkarni		\texttt{private\_data} & attach to MPA frame.\\ \hline
1387*14b24e2bSVaishali Kulkarni		\texttt{private\_data\_len} & length of private data.\\ \hline
1388*14b24e2bSVaishali Kulkarni	\end{tabular}
1389*14b24e2bSVaishali Kulkarni\end{center}
1390*14b24e2bSVaishali Kulkarni
1391*14b24e2bSVaishali Kulkarni\subsubsection{event callbacks related}
1392*14b24e2bSVaishali KulkarniThe callback received in listen call will be called with the following values after MPA response was acked by network/peer: \newline
1393*14b24e2bSVaishali Kulkarni\begin{tabular}{| l | p{10cm} |}
1394*14b24e2bSVaishali Kulkarni	\hline
1395*14b24e2bSVaishali Kulkarni	\textbf{Field} & \textbf{Value} \\ \hline
1396*14b24e2bSVaishali Kulkarni	\texttt{event} & \texttt{ECORE\_IWARP\_EVENT\_PASSIVE\_COMPLETE} \\ \hline
1397*14b24e2bSVaishali Kulkarni	\texttt{cm\_info} & \ref{sec:cminfo} with values negotiated. Don't care in case of mpa\_reject \\ \hline
1398*14b24e2bSVaishali Kulkarni	\texttt{ep\_context} & Don't care for this event. \\ \hline
1399*14b24e2bSVaishali Kulkarni	\texttt{status} & ECORE\_SUCCESS if connection establishment was successful. ECORE\_CONN\_RESET if connection establishment failed for any reason. ECORE\_CONNECTION\_REFUSED if mpa\_reject was called on the connection\\ \hline
1400*14b24e2bSVaishali Kulkarni\end{tabular}
1401*14b24e2bSVaishali Kulkarni
1402*14b24e2bSVaishali Kulkarni\subsubsection{ecore\_iwarp\_destroy\_listen}
1403*14b24e2bSVaishali KulkarniThis function will remove socket local and remote addresses (port, ip and vlan) from its listening database.
1404*14b24e2bSVaishali Kulkarni
1405*14b24e2bSVaishali Kulkarni
1406*14b24e2bSVaishali Kulkarni\subsection{Connection Teardown}
1407*14b24e2bSVaishali Kulkarni\label{sec:iwarp_teardown}
1408*14b24e2bSVaishali Kulkarni
1409*14b24e2bSVaishali Kulkarni\begin{figure}[h]
1410*14b24e2bSVaishali Kulkarni	\centering
1411*14b24e2bSVaishali Kulkarni	\includegraphics[scale=0.3]{iwarp_sm}
1412*14b24e2bSVaishali Kulkarni	\caption{iwarp state machine from hilland verbs}
1413*14b24e2bSVaishali Kulkarni	\label{fig:iwarp_sm}
1414*14b24e2bSVaishali Kulkarni\end{figure}
1415*14b24e2bSVaishali Kulkarni
1416*14b24e2bSVaishali KulkarniConnection teardown is performed via the modify\_qp verb according to the hilland verbs state machine (see figure \ref{fig:iwarp_sm}).
1417*14b24e2bSVaishali KulkarniThe interface into ecore is done with the states of RoCE and translated internally to iwarp states. This was done
1418*14b24e2bSVaishali Kulkarnito utilize the same interface for RoCE and iWARP. However, in the future this may be changed so that state translation
1419*14b24e2bSVaishali Kulkarniis done in the upper layer driver. Translation between the states is done as follows: \newline
1420*14b24e2bSVaishali Kulkarni\begin{tabular}{| l | p{10cm} |}
1421*14b24e2bSVaishali Kulkarni	\hline
1422*14b24e2bSVaishali Kulkarni	\textbf{RoCE State} & \textbf{iWARP State} \\ \hline
1423*14b24e2bSVaishali Kulkarni	ECORE\_ROCE\_QP\_STATE\_RESET & ECORE\_IWARP\_QP\_STATE\_IDLE \\ \hline
1424*14b24e2bSVaishali Kulkarni	ECORE\_ROCE\_QP\_STATE\_INIT & ECORE\_IWARP\_QP\_STATE\_IDLE \\ \hline
1425*14b24e2bSVaishali Kulkarni	ECORE\_ROCE\_QP\_STATE\_RTR & ECORE\_IWARP\_QP\_STATE\_IDLE \\ \hline
1426*14b24e2bSVaishali Kulkarni	ECORE\_ROCE\_QP\_STATE\_RTS & ECORE\_IWARP\_QP\_STATE\_RTS \\ \hline
1427*14b24e2bSVaishali Kulkarni	ECORE\_ROCE\_QP\_STATE\_SQD & ECORE\_IWARP\_QP\_STATE\_CLOSING \\ \hline
1428*14b24e2bSVaishali Kulkarni	ECORE\_ROCE\_QP\_STATE\_ERR & ECORE\_IWARP\_QP\_STATE\_ERROR \\ \hline
1429*14b24e2bSVaishali Kulkarni	ECORE\_ROCE\_QP\_STATE\_SQE & ECORE\_IWARP\_QP\_STATE\_TERMINATE \\ \hline
1430*14b24e2bSVaishali Kulkarni\end{tabular}
1431*14b24e2bSVaishali Kulkarni
1432*14b24e2bSVaishali Kulkarni\subsection{Active side connection Teardown}
1433*14b24e2bSVaishali Kulkarni\subsubsection{Graceful disconnect}
1434*14b24e2bSVaishali KulkarniTo initiate a graceful disconnect sequence, the active side will perform a modify\_qp to ECORE\_ROCE\_QP\_STATE\_SQD. This will be translated to ECORE\_IWARP\_QP\_STATE\_CLOSING and initiate a graceful teardown sequence with FW. Currently, due to existing FW implementation a modify qp to error will be sent to FW before closing the connection. In the future, FW HSI will be changed so that a CLOSING state is added to FW as well. Once the disconnect is complete, whether gracefully or abortively (in some cases a graceful disconnect will turn into an abortive one: timeouts, errors in close, etc.), an ECORE\_IWARP\_EVENT\_CLOSE event will be sent to upper layer driver. Ecore will transition to ERROR state in any case at the end of the flow.
1435*14b24e2bSVaishali Kulkarni
1436*14b24e2bSVaishali Kulkarni\subsubsection{Abortive disconnect}
1437*14b24e2bSVaishali KulkarniTo initiate an abortive disconnect sequence, the active side will perform a modify\_qp to ECORE\_ROCE\_QP\_STATE\_ERR. This will be translated to ECORE\_IWARP\_QP\_STATE\_ERROR and initiate an abortive teardown sequence with FW. Once the disconnect is completed, an ECORE\_IWARP\_EVENT\_CLOSE event will be sent to upper layer driver. Ecore will transition to ERROR state in any case at the end of the flow.
1438*14b24e2bSVaishali Kulkarni
1439*14b24e2bSVaishali Kulkarni\subsubsection{event callbacks related}
1440*14b24e2bSVaishali KulkarniThe callback received in connect / accept call will be called with the following values after disconnect has completed: \newline
1441*14b24e2bSVaishali Kulkarni\begin{tabular}{| l | p{10cm} |}
1442*14b24e2bSVaishali Kulkarni	\hline
1443*14b24e2bSVaishali Kulkarni	\textbf{Field} & \textbf{Value} \\ \hline
1444*14b24e2bSVaishali Kulkarni	\texttt{event} & \texttt{ECORE\_IWARP\_EVENT\_CLOSE} \\ \hline
1445*14b24e2bSVaishali Kulkarni	\texttt{cm\_info} & Don't care. \\ \hline
1446*14b24e2bSVaishali Kulkarni	\texttt{ep\_context} & Don't care for this event. \\ \hline
1447*14b24e2bSVaishali Kulkarni	\texttt{status} & ECORE\_SUCCESS if connection was terminated gracefully (FIN). ECORE\_CONN\_RESET if connection was terminated abortively (RST) for any reason. \\ \hline
1448*14b24e2bSVaishali Kulkarni\end{tabular}
1449*14b24e2bSVaishali Kulkarni
1450*14b24e2bSVaishali Kulkarni\subsection{Passive side connection Teardown}
1451*14b24e2bSVaishali KulkarniOn passive side teardown sequence is initiated once a graceful / abortive request is received from peer. In this case ecore will send a ECORE\_IWARP\_EVENT\_DISCONNECT to upper layer driver. Ecore will transition to ERROR state in any case at the end of the flow.
1452*14b24e2bSVaishali Kulkarni
1453*14b24e2bSVaishali Kulkarni\subsubsection{event callbacks related}
1454*14b24e2bSVaishali KulkarniThe callback received in connect / accept call will be called with the following values once close request was received from peer: \newline
1455*14b24e2bSVaishali Kulkarni\begin{tabular}{| l | p{10cm} |}
1456*14b24e2bSVaishali Kulkarni	\hline
1457*14b24e2bSVaishali Kulkarni	\textbf{Field} & \textbf{Value} \\ \hline
1458*14b24e2bSVaishali Kulkarni	\texttt{event} & \texttt{ECORE\_IWARP\_EVENT\_DISCONNECT} \\ \hline
1459*14b24e2bSVaishali Kulkarni	\texttt{cm\_info} & Don't care. \\ \hline
1460*14b24e2bSVaishali Kulkarni	\texttt{ep\_context} & Don't care for this event. \\ \hline
1461*14b24e2bSVaishali Kulkarni	\texttt{status} & ECORE\_SUCCESS if graceful disconnect was received. ECORE\_CONN\_RESET if abortive disconnect was received. \\ \hline
1462*14b24e2bSVaishali Kulkarni\end{tabular}
1463*14b24e2bSVaishali Kulkarni\\
1464*14b24e2bSVaishali KulkarniEcore will continue the disconnect flow against FW without any additional requests from upper layer driver. Ecore will call upper layer driver with the following values after disconnect has completed: \newline
1465*14b24e2bSVaishali Kulkarni\begin{tabular}{| l | p{10cm} |}
1466*14b24e2bSVaishali Kulkarni	\hline
1467*14b24e2bSVaishali Kulkarni	\textbf{Field} & \textbf{Value} \\ \hline
1468*14b24e2bSVaishali Kulkarni	\texttt{event} & \texttt{ECORE\_IWARP\_EVENT\_CLOSE} \\ \hline
1469*14b24e2bSVaishali Kulkarni	\texttt{cm\_info} & Don't care. \\ \hline
1470*14b24e2bSVaishali Kulkarni	\texttt{ep\_context} & Don't care for this event. \\ \hline
1471*14b24e2bSVaishali Kulkarni	\texttt{status} & ECORE\_SUCCESS if connection was terminated gracefully (FIN). ECORE\_CONN\_RESET if connection was terminated abortively (RST) for any reason. \\ \hline
1472*14b24e2bSVaishali Kulkarni\end{tabular}
1473*14b24e2bSVaishali Kulkarni
1474*14b24e2bSVaishali Kulkarni\section{IB verb implementation}
1475*14b24e2bSVaishali Kulkarni
1476*14b24e2bSVaishali Kulkarni\silentfunc{rdma_alloc_pd}
1477*14b24e2bSVaishali Kulkarni\silentfunc{rdma_alloc_tid}
1478*14b24e2bSVaishali Kulkarni\silentfunc{rdma_create_qp}
1479*14b24e2bSVaishali Kulkarni
1480*14b24e2bSVaishali Kulkarni\begin{itemize}
1481*14b24e2bSVaishali Kulkarni	\item \myfunc{rdma\_alloc\_pd}{rdma_alloc_pd} -- This function allocates a unique protection domain id. The id is returned in the out parameter pd. (verb: Allocate Protection Domain).
1482*14b24e2bSVaishali Kulkarni	\item \myfunc{rdma\_alloc\_tid}{rdma_alloc_tid} -- this function allocates a unique tid (task id). The id is returned in the out parameter itid. This function also allocates required memory in the ilt array (Host memory used for hw purposes).
1483*14b24e2bSVaishali Kulkarni	\item \myfunc{rdma\_create\_qp}{rdma_create_qp} -- This function will create the qp object in ecore and for iWARP in FW. In RoCE no FW ramrods are sent during the call to this function. The main change from existing create\_qp function, for iWARP, is that instead of providing addresses to rq, sq separately, and allocating memory for FW queues in ecore, FW requires contiguous memory for the pbl of all FW queues (RQ, SQ, ORQ, IRQ, HQ). Therefore interface will change, and instead of upper layer driver providing pbl address to create\_qp, these will be provided as out\_parameters after being allocated in ecore. Upper layer driver will be required to pass the number of pages required for SQ / RQ. Populating the pbls will be done after calling create\_qp and not before as done today. For ease of code sharing between iWARP and RoCE, FW will modify RoCE implementation to work the same as iWARP.
1484*14b24e2bSVaishali Kulkarni	\item \myfunc{rdma\_modify\_qp}{rdma_modify_qp} -- The API will remain the same, however, for iWARP not all fields are relevant. Naming convention of RDMA/iWARP/RoCE was done in ecore\_roce\_api to distinguish between what is required and what is not. Modify QP is used in iWARP for part of the teardown flow detailed in \ref{sec:iwarp_teardown}
1485*14b24e2bSVaishali Kulkarni\end{itemize}
1486*14b24e2bSVaishali Kulkarni
1487*14b24e2bSVaishali Kulkarni\section{IWARP APP TLV configuration}
1488*14b24e2bSVaishali KulkarniEcore client has the ability to signal ecore that a specific tcp port in app tlv should be recognized as pertaining to the iwarp offloaded connections. If an app tlv which matches this port is indicated by MFW, all offloaded iwarp traffic of the PF will abide by this configuration (regardless of the actual tcp port of the offloaded connections). The app tlv can be set by the ecore client via the regular APIs for setting ``locally administered params''. Ecore client communicates the tcp port value via \texttt{rdma\_pf\_params} structure, the value needs to be populated before invoking \myfunc{resc\_alloc}{resc_alloc}. To configure the iwarp app tlv in the locally administered dcbx parameters, ecore client needs to use the Dcbx APIs described in ``Dcbx Interface'' section. The relevant APIs are \myfunc{dcbx\_get\_config\_params}{dcbx_get_config_params} and \myfunc{dcbx\_config\_params}{dcbx_config_params}.
1489*14b24e2bSVaishali Kulkarni
1490*14b24e2bSVaishali Kulkarni\SpillChapterFuncs
1491*14b24e2bSVaishali Kulkarni
1492*14b24e2bSVaishali Kulkarni\chapterimage{qlogic-full-36}
1493*14b24e2bSVaishali Kulkarni\chapter{LL2 (Light L2)}
1494*14b24e2bSVaishali Kulkarni\label{cha:ll2}
1495*14b24e2bSVaishali Kulkarni
1496*14b24e2bSVaishali KulkarniThis chapter describes the ecore interface for LL2 (Light L2). \\
1497*14b24e2bSVaishali KulkarniThe LL2 is a simplified version of L2 for which both slowpath and fastpath flows reside in ecore, and it is being used by the upper-layer drivers of the storage protocols.
1498*14b24e2bSVaishali Kulkarni
1499*14b24e2bSVaishali Kulkarni\section{Start LL2 connection}
1500*14b24e2bSVaishali Kulkarni\silentfunc{ll2_acquire_connection}
1501*14b24e2bSVaishali Kulkarni\silentfunc{ll2_establish_connection}
1502*14b24e2bSVaishali Kulkarni\begin{itemize}
1503*14b24e2bSVaishali Kulkarni	\item \myfunc{ll2\_acquire\_connection}{ll2_acquire_connection} -- this function allocates the resources for the LL2 connection. ll2\_acquire\_data structure that is received in this function contains the following parameters:
1504*14b24e2bSVaishali Kulkarni
1505*14b24e2bSVaishali Kulkarni	 \begin{tabular}{| l | p{10cm} |}
1506*14b24e2bSVaishali Kulkarni	 	\hline
1507*14b24e2bSVaishali Kulkarni	 	\textbf{Param} & \textbf{Description} \\ \hline
1508*14b24e2bSVaishali Kulkarni	 	\texttt{conn\_type} & should be set according to the protocol. \\ \hline
1509*14b24e2bSVaishali Kulkarni	 	\texttt{mtu} & Maximum bytes that can be placed on a BD. \\ \hline
1510*14b24e2bSVaishali Kulkarni	 	\texttt{rx\_num\_desc} & maximal number of entries in the Rx ring. \\ \hline
1511*14b24e2bSVaishali Kulkarni	 	\texttt{tx\_num\_desc} & maximal number of entries in the Tx ring (each packet-buffer occupies an entry). \\ \hline
1512*14b24e2bSVaishali Kulkarni	 	\texttt{rx\_num\_ooo\_buffers} & Relevant only for OOO connection (if 0, default value of 2*rx\_num\_desc will be used). \\ \hline
1513*14b24e2bSVaishali Kulkarni	 	\texttt{rx\_drop\_ttl0\_flg} & can be set. \\ \hline
1514*14b24e2bSVaishali Kulkarni	 	\texttt{rx\_vlan\_removal\_en} & can be set if it is desired to get the VLAN stripped and out-of-band. \\ \hline
1515*14b24e2bSVaishali Kulkarni	 	\texttt{tx\_tc} & tx traffic class. 0 - regular tc, or for loopback use PURE\_LB\_TC or PKT\_LB\_TC for the rest. \\ \hline
1516*14b24e2bSVaishali Kulkarni	 	\texttt{tx\_dest} & Destination of tx -> Network or Loopback. \\ \hline
1517*14b24e2bSVaishali Kulkarni	 	\texttt{ai\_err\_packet\_too\_big} & How FW should handle packet too big error: (DROP, NOTHING, ASSERT). \\ \hline
1518*14b24e2bSVaishali Kulkarni	 	\texttt{ai\_err\_no\_buf} & How FW should handle no buffers error: (DROP, NOTHING, ASSERT). \\ \hline
1519*14b24e2bSVaishali Kulkarni	 	\texttt{gsi\_enable} & Relevant for RoCE only - is the ll2 intended to work with GSI Offload or not. \\ \hline
1520*14b24e2bSVaishali Kulkarni	 	\texttt{p\_connection\_handle} & Output parameter contains a handle which is used in future calls related to this LL2 connection. \\ \hline
1521*14b24e2bSVaishali Kulkarni	 	\texttt{cbs} & Callback functions that should be called on completion or release of rx / tx packets. \\ \hline
1522*14b24e2bSVaishali Kulkarni	 \end{tabular}
1523*14b24e2bSVaishali Kulkarni
1524*14b24e2bSVaishali Kulkarni
1525*14b24e2bSVaishali Kulkarni	\item \myfunc{ll2\_establish\_connection}{ll2_establish_connection} -- this function offloads the LL2 connection to the device (both Tx and Rx).
1526*14b24e2bSVaishali Kulkarni	\item After establishing the connection, it is possible to post Rx buffers and to send Tx packets.
1527*14b24e2bSVaishali Kulkarni\end{itemize}
1528*14b24e2bSVaishali Kulkarni
1529*14b24e2bSVaishali Kulkarni\section{Receive LL2 packets}
1530*14b24e2bSVaishali Kulkarni\silentfunc{ll2_post_rx_buffer}
1531*14b24e2bSVaishali Kulkarni\begin{itemize}
1532*14b24e2bSVaishali Kulkarni	\item \myfunc{ll2\_post\_rx\_buffer}{ll2_post_rx_buffer} -- this function adds the provided buffer to the receive ring. The buffer size should be at least mtu (as provided during connection start) + maximum Ethernet header size + cache line size + 4 (cache line size is typically 64 byte). \texttt{notify\_fw} should be set. \texttt{addr} should be a DMA-mapped address.
1533*14b24e2bSVaishali Kulkarni	\subsection{Related callback functions ( received in acquire\_connection)}
1534*14b24e2bSVaishali Kulkarni	\item \texttt{complete\_rx\_packet} -- this is a callback function that should be implemented in the upper driver. Ecore calls this function when a packet is received and written to a buffer in the Rx ring. \texttt{cookie} and \texttt{rx\_buf\_addr} are echoed from the call that posted that buffer. \texttt{placement\_offset} is the offset in bytes in the buffer, starting from which the packet was written. \texttt{packet\_length} is the total packet length in bytes. \texttt{opaque\_data\_0/1} and \texttt{b\_last\_packet} can be ignored. \texttt{vlan} is the VLAN tag stripped from the packet, and it is valid only if PARSING\_AND\_ERR\_FLAGS\_TAG8021QEXIST bit is set in \texttt{parse\_flags}. \texttt{parse\_flags} field contains additional flags which are mostly not interesting for the upper driver.
1535*14b24e2bSVaishali Kulkarni	\item \texttt{release\_rx\_packet} -- this is a callback function that should be implemented in the upper driver. Ecore calls this function when the connection is terminated and there are still buffers in the Rx ring. In this case it will call this function per each buffer, so the upper driver can free those buffers.
1536*14b24e2bSVaishali Kulkarni\end{itemize}
1537*14b24e2bSVaishali Kulkarni
1538*14b24e2bSVaishali Kulkarni\section{Transmit LL2 packets}
1539*14b24e2bSVaishali Kulkarni\silentfunc{ll2_prepare_tx_packet}
1540*14b24e2bSVaishali Kulkarni\silentfunc{ll2_set_fragment_of_tx_packet}
1541*14b24e2bSVaishali Kulkarni\begin{itemize}
1542*14b24e2bSVaishali Kulkarni	\item \myfunc{ll2\_prepare\_tx\_packet}{ll2_prepare_tx_packet} -- this function adds a new packet to the transmit ring. If the packet is composed from more than a single buffer, then the address and length of the additional buffers are provided to ecore by calling \texttt{ecore\_ll2\_set\_fragment\_of\_tx\_packet} for each additional buffer. \\
1543*14b24e2bSVaishali Kulkarni	\texttt{num\_of\_bds} is the number of buffers that compose the packet (including the first buffer), and is limited to CORE\_LL2\_TX\_LOCAL\_RING\_SIZE.
1544*14b24e2bSVaishali Kulkarni	\texttt{first\_frag} should be a DMA-mapped address, and \texttt{first\_frag\_len} is the buffer length in bytes. \texttt{vlan} is the VLAN tag to insert in the packet (if desired), and in this case CORE\_TX\_BD\_FLAGS\_VLAN\_INSERTION flag in \texttt{bd\_flags} should be set. \\
1545*14b24e2bSVaishali Kulkarni	For IP checksum and L4 checksum offload, CORE\_TX\_BD\_FLAGS\_IP\_CSUM and CORE\_TX\_BD\_FLAGS\_L4\_CSUM flags in \texttt{bd\_flags} should be set. \texttt{notify\_fw} should be set.
1546*14b24e2bSVaishali Kulkarni	\item \myfunc{ll2\_set\_fragment\_of\_tx\_packet}{ll2_set_fragment_of_tx_packet} -- this function provides the next buffer of a packet. \texttt{addr} should be a DMA-mapped address, and \texttt{nbytes} is the buffer length in bytes.
1547*14b24e2bSVaishali Kulkarni	\subsection{Related callback functions ( received in acquire\_connection)}
1548*14b24e2bSVaishali Kulkarni	\item \texttt{complete\_tx\_packet} -- this is a callback function that should be implemented in the upper driver. Ecore calls this function when the transmission of the packet is completed (it is called once per-packet). \texttt{cookie} and \texttt{first\_frag\_addr} are echoed from the call that posted that first fragment of the packet. \texttt{b\_last\_fragment} and \texttt{b\_last\_packet} can be ignored.
1549*14b24e2bSVaishali Kulkarni	\item \texttt{release\_tx\_packet} -- this is a callback function that should be implemented in the upper driver. Ecore calls this function when the connection is terminated and there are still packets in the Tx ring. In this case it will call this function per each packet, so the upper driver can free the associated buffers.
1550*14b24e2bSVaishali Kulkarni\end{itemize}
1551*14b24e2bSVaishali Kulkarni
1552*14b24e2bSVaishali Kulkarni\section{Stop LL2 connection}
1553*14b24e2bSVaishali Kulkarni\silentfunc{ll2_terminate_connection}
1554*14b24e2bSVaishali Kulkarni\silentfunc{ll2_release_connection}
1555*14b24e2bSVaishali Kulkarni\begin{itemize}
1556*14b24e2bSVaishali Kulkarni	\item \myfunc{ll2\_terminate\_connection}{ll2_terminate_connection} -- this function removes the LL2 connection from the device. When this function is called, ecore checks for non-completed Tx packets / Rx buffers, and calls the \texttt{release\_tx\_packet()} and / or \texttt{release\_rx\_packet()} callback functions respectively.
1557*14b24e2bSVaishali Kulkarni	\item \myfunc{ll2\_release\_connection}{ll2_release_connection} -- this function releases the resources for the LL2 connection.
1558*14b24e2bSVaishali Kulkarni\end{itemize}
1559*14b24e2bSVaishali Kulkarni
1560*14b24e2bSVaishali Kulkarni\section{Getting statistics}
1561*14b24e2bSVaishali Kulkarni\myfunc{ll2\_get\_stats}{ll2_get_stats} can be used to query the device for various ll2-related statistics.
1562*14b24e2bSVaishali Kulkarni
1563*14b24e2bSVaishali Kulkarni\SpillChapterFuncs
1564*14b24e2bSVaishali Kulkarni
1565*14b24e2bSVaishali Kulkarni\chapterimage{qlogic-full-36}
1566*14b24e2bSVaishali Kulkarni\chapter{Single Root I/O Virtualization}
1567*14b24e2bSVaishali Kulkarni\label{cha:sriov}
1568*14b24e2bSVaishali Kulkarni
1569*14b24e2bSVaishali Kulkarni\myindex{SRIOV} is a PCIe functionality which allows Physical functions (also termed \myindex{PF}s) to spawn Virtual functions (also termed \myindex{VF}s), with a limited set of registers in their PCI configuration space and bars, but that should supply roughly the same basic functionality.
1570*14b24e2bSVaishali Kulkarni
1571*14b24e2bSVaishali KulkarniSR-IOV handling is performed by the ecore on the hypervisor as well as the ecore on the VM.
1572*14b24e2bSVaishali KulkarniThese work hand in hand (either via HW-channel or through SW-channel) to configure the VF device for SR-IOV.
1573*14b24e2bSVaishali KulkarniWith some exceptions, it could be said that the upper driver doesn't need to be aware of driving a VF instead of a PF.
1574*14b24e2bSVaishali KulkarniWhere on the PF side upper-layer driver accesses the ecore to send a ramrod on its behalf or perform a configuration,
1575*14b24e2bSVaishali Kulkarnion the VF side the upper driver will use the same API to access its ecore, which will in turn communicate with the PF's ecore via the channel to perform the equivalent configuration.
1576*14b24e2bSVaishali KulkarniThis is an abstraction, and there are quite a few reservations and exceptions, but that is the working model.
1577*14b24e2bSVaishali Kulkarni
1578*14b24e2bSVaishali KulkarniSections \ref{sec:sriov-hw-channel}, \ref{sec:sriov-tlv} mostly give a glimpse of the mechanism used by the ecore to support the feature, while the rest of the sections are of more interest to the upper-driver implementer since they contain the howtos.
1579*14b24e2bSVaishali Kulkarni
1580*14b24e2bSVaishali KulkarniSome relevant documents are \cite{doc:iov-lec}, \cite{doc:iov-sys} and \cite{doc:iov-doc}.
1581*14b24e2bSVaishali Kulkarni
1582*14b24e2bSVaishali Kulkarni\section{IOV-related fields and terminology}
1583*14b24e2bSVaishali KulkarniThe \textit{ecore\_dev} contains an \textit{sriov\_info} field, which is filled very early during initialization (inside \textit{ecore\_hw\_prepare()}) according to pci configuration space sriov capability. Later on, this struct is read-only by the ecore.
1584*14b24e2bSVaishali KulkarniUpper driver can read values in this struct [instead of accessing PCI configuration space] if needed,
1585*14b24e2bSVaishali Kulkarnibut there is a single field it `owns', b\_hw\_channel -- In most distros VFs will communicate with PFs using the HW-channel [see section \ref{sec:sriov-hw-channel}], and upper-driver should set it to `true'. However, if upper-driver utilizes a designated SW-channel which it can use instead of the HW-channel, it should let this field remain `false'. \\
1586*14b24e2bSVaishali Kulkarni
1587*14b24e2bSVaishali KulkarniAn additional important field is the \myindex{total\_vfs} which represents the maximal number of VFs current PF can possibly have. The macro \myindex{IS\_ECORE\_SRIOV} can be used to determine if PF has $\text{total\_vfs} > 0$, therefore whether IOV is relevant to the PF or not. \\
1588*14b24e2bSVaishali Kulkarni
1589*14b24e2bSVaishali KulkarniImportant terminology when talking about VFs is \myindex{relative\_vfid} versus \myindex{absolute\_vfid}. The relative vfid is the zero-based index of the VF relative to its parent PF, i.e., the first VF of a given PF is always 0, second is 1, etc.
1590*14b24e2bSVaishali KulkarniThe absolute vfid is the zero-based index of the VF relative to all the VFs on the same path, i.e., it's possible the first VF of a given PF will have an absolute vfid which is greater than zero.
1591*14b24e2bSVaishali Kulkarni
1592*14b24e2bSVaishali KulkarniFor most upper-driver uses, the relative vfid is the interesting index. Ecore sometimes needs to use the absolute value for configuring the FW/HW.
1593*14b24e2bSVaishali Kulkarni
1594*14b24e2bSVaishali Kulkarni\section{Initializing and Closing VFs}
1595*14b24e2bSVaishali KulkarniWhen a PF is about to initialize its VFs, it should enable the access of each VF to the HW by calling \myfunc{iov\_init\_hw\_for\_vf}{iov_init_hw_for_vf} for each VF [passing its relative vfid].
1596*14b24e2bSVaishali Kulkarni\begin{NOTICE}
1597*14b24e2bSVaishali Kulkarni	At this point upper-layer driver has to know the number of interrupts to assign to each VF, since the MSIX table in each VF configuration space must be arranged by ecore prior to VF activation.
1598*14b24e2bSVaishali Kulkarni\end{NOTICE}
1599*14b24e2bSVaishali Kulkarni
1600*14b24e2bSVaishali KulkarniFollowing this, upper-driver can initiate the sequence [usually via OS api] that would enable the VFs and cause them to be probed.
1601*14b24e2bSVaishali KulkarniAfterwards, upper-driver can initialize the VF same as it would have the PF, i.e., the difference in initialization logic is `hidden' inside the ecore. Upper-layer code doesn't need to contain all sorts of if-else clauses to differentiate between VF and PF [at least, not as far as the ecore initialization is concerned].
1602*14b24e2bSVaishali Kulkarni
1603*14b24e2bSVaishali KulkarniClosing the VF should operate smoothly without need of any special calls. I.e., regular closure sequence for PFs should be translated by ecore opaquely into a sequence closing the VF.
1604*14b24e2bSVaishali Kulkarni
1605*14b24e2bSVaishali KulkarniThe only 'special' effort that should be taken is that after all the flow is done the PF's upper-driver should call \myfunc{iov\_release\_hw\_for\_vf}{iov_release_hw_for_vf}. After doing this, VF can only be re-activated by re-calling \textit{ecore\_iov\_init\_hw\_for\_vf()}.
1606*14b24e2bSVaishali Kulkarni
1607*14b24e2bSVaishali KulkarniIf the upper-driver has the option, during sriov-disable prior to VFs being unset in the PF's PCI configuration space, upper-driver should call \myfunc{iov\_set\_vf\_to\_disable}{iov_set_vf_to_disable} for each one of its active VFs. This will result in a cleaner FW/HW after closure is complete.
1608*14b24e2bSVaishali Kulkarni
1609*14b24e2bSVaishali Kulkarni\section{Message passing from VF to PF}
1610*14b24e2bSVaishali Kulkarni\label{sec:sriov-tlv}
1611*14b24e2bSVaishali KulkarniThe VF's PCI bar is very different from the PF bar, and with much more limited access toward chip; see \cite{doc:iov-sys} for details about the VF bar. As a result, most of the slowpath configuration that needs to be done for the VF actually has to be done by the PF.
1612*14b24e2bSVaishali Kulkarni
1613*14b24e2bSVaishali KulkarniTo support this, there is a mechanism of \myindex{TLV}\footnote{Type-Length-Value} message passing from VF to PF, in which the VF can request the PF to either perform services for it or supply it with information which is otherwise inaccessible for the VF.
1614*14b24e2bSVaishali KulkarniThis message passing is usually done via the HW channel [see section \ref{sec:sriov-hw-channel}], but assuming the existence of an alternative method [i.e., SW-channel] it can be done via it just as well.
1615*14b24e2bSVaishali Kulkarni
1616*14b24e2bSVaishali Kulkarni\begin{exampleT}
1617*14b24e2bSVaishali Kulkarni	During \textit{ecore\_hw\_prepare()} ecore gathers information about the chip from various locations - HW, shared memory with Management FW, etc. However, almost all of that information is inaccessible to the VF. Thus the VF has an alternative flow by which it sends an ACQUIRE message to the PF, notifying it that it's up and requesting information about the device - e.g., number of resources such as status blocks and queues available to the VF.
1618*14b24e2bSVaishali Kulkarni\end{exampleT}
1619*14b24e2bSVaishali Kulkarni
1620*14b24e2bSVaishali KulkarniNotice the upper-driver itself should never initiate such a message passing directly; All such message passing is 'hidden' inside the VF's ecore.
1621*14b24e2bSVaishali Kulkarni
1622*14b24e2bSVaishali Kulkarni\begin{exampleT}
1623*14b24e2bSVaishali Kulkarni	When a VF driver wants to start a vport, it calls \textit{ecore\_sp\_vport\_start()},
1624*14b24e2bSVaishali Kulkarni unaware that inside the ecore this will send a VPORT\_START TLV message from VF to PF, and that the PF will open the vport for the VF as a result.
1625*14b24e2bSVaishali Kulkarni\end{exampleT}
1626*14b24e2bSVaishali Kulkarni
1627*14b24e2bSVaishali Kulkarni\section{HW channel}
1628*14b24e2bSVaishali Kulkarni\label{sec:sriov-hw-channel}
1629*14b24e2bSVaishali KulkarniThis is how the HW-channel operates [High level]:
1630*14b24e2bSVaishali Kulkarni
1631*14b24e2bSVaishali Kulkarni\silentfunc{iov_process_mbx_req}
1632*14b24e2bSVaishali Kulkarni\silentfunc{iov_copy_vf_msg}
1633*14b24e2bSVaishali Kulkarni\begin{enumerate}
1634*14b24e2bSVaishali Kulkarni	\item Prior to VF activation, PF enables VF access to the HW-channel, which actually permits it to access its PCI bar.
1635*14b24e2bSVaishali Kulkarni	\item VF prepares a message on DMA-able memory, which also contains an address of an additional DMA-able memory upon which the VF will poll for the PF reply.
1636*14b24e2bSVaishali Kulkarni	\item VF writes a `trigger' containing the buffer's ghost physical address into the specific address in the USDM. It then polls until reply is received [or timeout is reached].
1637*14b24e2bSVaishali Kulkarni	\item This BAR access to the Ustorm RAM is trapped as an aggregated interrupt and activates a handler in Storm FW.
1638*14b24e2bSVaishali Kulkarni	\item FW identifies the sending VF according to address and trigger's content and derives the parent PF's id. It then triggers an interrupt [event] on the PF, filling the event's cookie with buffer's address.
1639*14b24e2bSVaishali Kulkarni	\item PF driver's ISR wakes. It recognizes the message and calls OSAL\_PF\_VF\_MSG to notify upper-layer driver of the message; This is mostly since the slowpath context isn't the proper place to handle VF messages.
1640*14b24e2bSVaishali Kulkarni	\item Upper-layer driver should utilize DMAE [\myfunc{iov\_copy\_vf\_msg}{iov_copy_vf_msg}] in order to copy the buffer from the VF's memory domain into its own. [PF uses the VF's pci requestor-id for the DMAE transaction, to access the VM's Ghost Physical Address].
1641*14b24e2bSVaishali Kulkarni	Following that, it should schedule the proper context for handling the VF message calling \myfunc{iov\_process\_mbx\_req}{iov_process_mbx_req} to allow ecore to process the VF's message.
1642*14b24e2bSVaishali Kulkarni	\item ... PF processes the VF's request ...
1643*14b24e2bSVaishali Kulkarni	\item PF prepares an answer for the VF [success, failure, etc.] which might also contain information. It uses DMAE to copy this message to the VF's reply address [specified in the VF's message].
1644*14b24e2bSVaishali Kulkarni	\item VF wakes from the PF's message and processes the answer.
1645*14b24e2bSVaishali Kulkarni\end{enumerate}
1646*14b24e2bSVaishali Kulkarni
1647*14b24e2bSVaishali KulkarniOne optional auxiliary function that can be used by the ecore-client is \myfunc{iov\_pf\_get\_pending\_events}{iov_pf_get_pending_events}. It will return a bitmask of all the VFs belonging to the PF for which there's a message yet to be processed.
1648*14b24e2bSVaishali Kulkarni
1649*14b24e2bSVaishali KulkarniNotice that the Hw-channel is one-pending, i.e., VF cannot send an additional message until PF has notified FW that it's done processing the message.
1650*14b24e2bSVaishali Kulkarni
1651*14b24e2bSVaishali Kulkarni\begin{NOTICE}
1652*14b24e2bSVaishali Kulkarni	If VF will try sending an additional message, FW will mark it as malicious.
1653*14b24e2bSVaishali Kulkarni\end{NOTICE}
1654*14b24e2bSVaishali Kulkarni
1655*14b24e2bSVaishali Kulkarni\section{Message Passing from PF to VF}
1656*14b24e2bSVaishali KulkarniThe Message passing from VF to PF benefits from the PF's slowpath status-block, i.e., the ability of the PF to receive slowpath interrupts.
1657*14b24e2bSVaishali KulkarniSince the VF hasn't got such a status block allocated for it, the message passing between the PF and the VF consists of polling on the VF side.
1658*14b24e2bSVaishali KulkarniThe VF has a buffer named \myindex{bulletin-board} into which the PF posts messages.
1659*14b24e2bSVaishali KulkarniThe VF would periodically poll this buffer for updates.
1660*14b24e2bSVaishali Kulkarni
1661*14b24e2bSVaishali Kulkarni\begin{exampleT}
1662*14b24e2bSVaishali Kulkarni	PF can use bulletin boards to notify VF of current link state. Notice that link state doesn't necessarily have to reflect the physical link state.
1663*14b24e2bSVaishali KulkarniE.g., Hypervisor tools might be used to configure VF link state as always up regardless of physical state, so that VFs could communicate using Tx-Switching.
1664*14b24e2bSVaishali Kulkarni\end{exampleT}
1665*14b24e2bSVaishali Kulkarni
1666*14b24e2bSVaishali KulkarniThe bulletin board periodic sampling is a policy that needs to be determined and done by the upper-layer driver. It's done by calling the API function
1667*14b24e2bSVaishali Kulkarni\myfunc{vf\_read\_bulletin}{vf_read_bulletin}, which checks for any changes.
1668*14b24e2bSVaishali KulkarniIf such a change occurs, since the bulletin doesn't contain deltas from previous messages but rather the entire data [due to lack of handshake the PF can't know if VF read previous bulletin boards], the upper-driver has a wide assortment of functions-per-feature which are defined in ecore\_vf\_api.h and can be used to learn of the current state. E.g., \myfunc{vf\_get\_link\_state}{vf_get_link_state},
1669*14b24e2bSVaishali Kulkarni\myfunc{vf\_get\_port\_mac}{vf_get_port_mac}.
1670*14b24e2bSVaishali Kulkarni
1671*14b24e2bSVaishali Kulkarni\subsection{How the bulletin board works}
1672*14b24e2bSVaishali Kulkarni
1673*14b24e2bSVaishali Kulkarni[Inner workings of the ecore; Not necessary for the upper-driver implementer]
1674*14b24e2bSVaishali Kulkarni
1675*14b24e2bSVaishali Kulkarni\begin{enumerate}
1676*14b24e2bSVaishali Kulkarni	\item During \textit{ecore\_hw\_prepare()} of the VF, ecore allocates a DMA-able buffer for the bulletin board.
1677*14b24e2bSVaishali Kulkarni	\item During the ACQUIRE message sent from VF to PF, VF posts the physical address of the bulletin board as well as its size.
1678*14b24e2bSVaishali Kulkarni	\item During the ACQUIRE response sent from PF to VF, PF agrees upon the size of bulletin board which will be used [forward-backward compatibility].
1679*14b24e2bSVaishali Kulkarni	\item Whenever any of the bulletin fields the PF wants to post changes, PF increments a counter, calculates a CRC and uses DMAE to copy its local buffer into the VF's bulletin buffer.
1680*14b24e2bSVaishali Kulkarni	\item On the VF-side, the polled \textit{ecore\_vf\_read\_bulletin()} samples the buffer, verifies the CRC [to make sure it has a consistent image of the buffer] and, if the bulletin index has incremented since last seen, gets updated according to the new bulletin board.
1681*14b24e2bSVaishali Kulkarni\end{enumerate}
1682*14b24e2bSVaishali Kulkarni
1683*14b24e2bSVaishali Kulkarni\section{Function Level Reset}
1684*14b24e2bSVaishali KulkarniPCI Function Level Reset [\myindex{FLR}] is a functionality triggered by a write to a specific [standard] bit in the PCI function configuration space, which should result in the function being reset.
1685*14b24e2bSVaishali KulkarniOn many OSes this feature is used to reset VFs on certain occasions, such as their physical assignment and de-assignment from VMs.
1686*14b24e2bSVaishali KulkarniIn addition, FLR might be used internally by driver/FW in case of malicious VFs, where that VF's database should be cleared before re-enablement.
1687*14b24e2bSVaishali Kulkarni
1688*14b24e2bSVaishali Kulkarni\begin{NOTICE}
1689*14b24e2bSVaishali Kulkarni	At this time, malicious VF handling does not exist in the ecore.
1690*14b24e2bSVaishali Kulkarni\end{NOTICE}
1691*14b24e2bSVaishali Kulkarni
1692*14b24e2bSVaishali KulkarniThe FLR flow is a complicated flow which involves Management firmware, storm firmware and driver all working on cleaning the HW and their own databases
1693*14b24e2bSVaishali Kulkarni[See \cite{doc:iov-sys} for more details]. From driver point-of-view, management FW notifies driver of FLR after it and the storm FW have already done some work [storm FW has done what's called `initial cleanup'].
1694*14b24e2bSVaishali KulkarniEcore handles the MFW message about FLR, and eventually notifies upper-layer driver via \myindex{OSAL\_VF\_FLR\_UPDATE} about the FLR.
1695*14b24e2bSVaishali Kulkarni
1696*14b24e2bSVaishali Kulkarni\begin{NOTICE}
1697*14b24e2bSVaishali Kulkarni	Again, this OSAL is mainly for allowing the upper-layer driver to move this run from the slowpath context into a different context.
1698*14b24e2bSVaishali Kulkarni\end{NOTICE}
1699*14b24e2bSVaishali Kulkarni
1700*14b24e2bSVaishali KulkarniUpper-layer driver should clean whatever non-ecore `volatile' information it holds for those VFs, and then call \myfunc{iov\_vf\_flr\_cleanup}{iov_vf_flr_cleanup}, which will continue the FLR process -- send a final cleanup ramrod to FW and notify MFW that the FLR process has been completed. Following this call, the FLRed VFs should be operational and in `clean slate' mode.
1701*14b24e2bSVaishali Kulkarni
1702*14b24e2bSVaishali Kulkarni\begin{NOTICE}
1703*14b24e2bSVaishali Kulkarni	Unless \textit{ecore\_iov\_set\_vf\_to\_disable()} was called, in which case following the FLR those VFs will be disabled in FW/HW.
1704*14b24e2bSVaishali Kulkarni\end{NOTICE}
1705*14b24e2bSVaishali Kulkarni
1706*14b24e2bSVaishali Kulkarni\section{versioning}
1707*14b24e2bSVaishali Kulkarni\subsection{Slowpath versions}
1708*14b24e2bSVaishali KulkarniSr-iov is exposed to complex versioning challenges. Specifically, a given PF driver may be working with VF drivers of older and/or newer versions at the same time.
1709*14b24e2bSVaishali KulkarniThis means that the channel and bulletin board must be forwards and backwards compatible. The Bulletin Board achieves this by only adding new fields.
1710*14b24e2bSVaishali KulkarniThe Channel achieves compatibility through a TLV interface. Messages will always contain a type, length, value header, and may have multiple such parts.
1711*14b24e2bSVaishali KulkarniThe receiver of a message (be it PF receiving a request or VF receiving a response) will parse the message, process the parts it is aware of and be able to skip over parts which it doesn't recognize.
1712*14b24e2bSVaishali KulkarniThis design allows to declare messages as obsolete, modify existing messages by adding/removing modular pieces, etc.
1713*14b24e2bSVaishali Kulkarni
1714*14b24e2bSVaishali Kulkarni\subsection{Fastpath versions}
1715*14b24e2bSVaishali KulkarniThe compatibility requirements of fastpath flows have to be lenient, as we can't afford to penalize the performance.
1716*14b24e2bSVaishali KulkarniIf the fastpath API changes in a non backward compatible fashion (assumed to be a rare occurrence) the VF will either fail to load, or else have to carry with it several alternate implementations for fastpath. The VF driver learns of the fastpath version from the slowpath interaction with the PF.
1717*14b24e2bSVaishali Kulkarni
1718*14b24e2bSVaishali Kulkarni\SpillChapterFuncs
1719*14b24e2bSVaishali Kulkarni
1720*14b24e2bSVaishali Kulkarni\chapterimage{qlogic-full-36}
1721*14b24e2bSVaishali Kulkarni\chapter{Selftest}
1722*14b24e2bSVaishali Kulkarni\label{cha:Selftest}
1723*14b24e2bSVaishali Kulkarni
1724*14b24e2bSVaishali KulkarniThis chapter describes the ecore interfaces for selftests. The scope of the selftests is to sample various aspects of device functionality and verify that it is operational. It is not intended and does not lay claim to perform full coverage of any functionality. \\
1725*14b24e2bSVaishali Kulkarni
1726*14b24e2bSVaishali Kulkarni\section{Register Test}
1727*14b24e2bSVaishali Kulkarni	\myfunc{selftest\_register}{selftest_register} -- this test verifies the data integrity of the registers. It writes a predefined value to the register, reads it back and verifies that the contents are correctly saved. It saves the register original content before performing the test and restores its value after the test. This test is performed via MFW and accesses registers from both engines as well as registers from engine common blocks.
1728*14b24e2bSVaishali Kulkarni
1729*14b24e2bSVaishali Kulkarni\section{Clock Test}
1730*14b24e2bSVaishali Kulkarni	\myfunc{selftest\_clock}{selftest_clock} -- it measures the clock frequencies of the E4 modules. The clocks verified in this test are:
1731*14b24e2bSVaishali Kulkarni	\begin{itemize}
1732*14b24e2bSVaishali Kulkarni		\item Main clock frequency
1733*14b24e2bSVaishali Kulkarni		\item STORM clock frequency
1734*14b24e2bSVaishali Kulkarni		\item NW clock frequency
1735*14b24e2bSVaishali Kulkarni	\end{itemize}
1736*14b24e2bSVaishali Kulkarni
1737*14b24e2bSVaishali Kulkarni\section{Interrupt Test}
1738*14b24e2bSVaishali Kulkarni	\myfunc{selftest\_interrupt}{selftest_interrupt} -- this test verifies the interrupt path. Ecore employs its most basic flow which exercises interrupts, the heartbeat ramrod. Ramrod is sent and interrupt is received.
1739*14b24e2bSVaishali Kulkarni
1740*14b24e2bSVaishali Kulkarni\section{Memory Test}
1741*14b24e2bSVaishali Kulkarni	\myfunc{selftest\_memory}{selftest_memory} -- this test samples some of the memories in the device. Ecore employs its most basic flow which exercises memories, again the heartbeat ramrod. In this flow context is loaded to the context manager memory and is verified by the storm FW (otherwise the ramrod would fail).
1742*14b24e2bSVaishali Kulkarni
1743*14b24e2bSVaishali Kulkarni\section{NVRAM Test}
1744*14b24e2bSVaishali Kulkarni	\myfunc{selftest\_nvram}{selftest_nvram} -- this performs the nvram test. It loops through all the nvram partitions, reads the image on the partition and validates its crc.
1745*14b24e2bSVaishali Kulkarni
1746*14b24e2bSVaishali Kulkarni\SpillChapterFuncs
1747*14b24e2bSVaishali Kulkarni
1748*14b24e2bSVaishali Kulkarni\chapterimage{qlogic-full-36}
1749*14b24e2bSVaishali Kulkarni\chapter{Precision Time Protocol (PTP) support}
1750*14b24e2bSVaishali Kulkarni\label{cha:ptp}
1751*14b24e2bSVaishali KulkarniThis chapter provides a high-level overview of PTP and describes the ecore interfaces for the same. PTP, also known as Time Sync, allows the synchronization of the clocks in distributed systems. The protocol selects one clock in the network as master clock and all other clocks (slave clocks) synchronize with the master. Driver's responsibilities include enabling/disabling the PTP feature on the device, registering/un-registering the hardware clock and its operations with the OS, and configuring the required Rx/Tx PTP filters. HW/FW does the timestamping of Tx/Rx PTP packets; the driver needs to read these timestamp values and present them to upper layer protocols (e.g., IPv4). Rx timestamping will be available during the Rx interrupt processing of the driver. FW does the Tx timestamping when the first byte of the PTP packet is placed on the wire; the driver has to poll for the availability of this timestamp value when processing the PTP Tx packet. \\
1752*14b24e2bSVaishali Kulkarni\section{Ecore APIs}
1753*14b24e2bSVaishali KulkarniTo enable PTP support, ecore-client should call \myfunc{ptp\_enable}{ptp_enable} and then configure the required PTP filters which include
1754*14b24e2bSVaishali Kulkarnienabling the Tx timestamping using \myfunc{ptp\_hwtstamp\_tx\_on}{ptp_hwtstamp_tx_on} and configuring the Rx filter mode
1755*14b24e2bSVaishali Kulkarniusing \myfunc{ptp\_cfg\_rx\_filters}{ptp_cfg_rx_filters} API.
1756*14b24e2bSVaishali KulkarniRx filter mode instructs the device to trace the configured Rx PTP packets such as L2, IPv4 etc.
1757*14b24e2bSVaishali KulkarniRx/Tx timestamp values can be read using the APIs \myfunc{ptp\_read\_rx\_ts}{ptp_read_rx_ts} and
1758*14b24e2bSVaishali Kulkarni\myfunc{ptp\_read\_tx\_ts}{ptp_read_tx_ts} respectively.
1759*14b24e2bSVaishali KulkarniThe API \myfunc{ptp\_read\_cc}{ptp_read_cc} can be used to read the Phy hardware clock and
1760*14b24e2bSVaishali Kulkarnithe API \myfunc{ptp\_adjfreq}{ptp_adjfreq} provides implementation for adjusting the hardware clock by a rate given in parts-per-billion (ppb) units. \\
1761*14b24e2bSVaishali KulkarniAs part of feature cleanup, the ecore client should call the \myfunc{ptp\_disable}{ptp_disable} API to disable the PTP feature on the hardware. \\
1762*14b24e2bSVaishali Kulkarni\begin{NOTICE}
1763*14b24e2bSVaishali KulkarniIt is the driver's responsibility to read the Rx/Tx timestamp values. The timestamp register will not be freed for the next PTP packets until the current value is read by the driver.
1764*14b24e2bSVaishali Kulkarni\end{NOTICE}
1765*14b24e2bSVaishali Kulkarni
1766*14b24e2bSVaishali Kulkarni\SpillChapterFuncs
1767*14b24e2bSVaishali Kulkarni
1768*14b24e2bSVaishali Kulkarni%\chapterimage{qlogic-full-36}
1769*14b24e2bSVaishali Kulkarni%\chapter{Statistics}
1770*14b24e2bSVaishali Kulkarni%\begin{NOTICE}
1771*14b24e2bSVaishali Kulkarni%Placeholder - owner Dmitry
1772*14b24e2bSVaishali Kulkarni%\end{NOTICE}
1773*14b24e2bSVaishali Kulkarni
1774*14b24e2bSVaishali Kulkarni%\chapterimage{qlogic-full-36}
1775*14b24e2bSVaishali Kulkarni%\chapter{Peripherals}
1776*14b24e2bSVaishali Kulkarni%\begin{NOTICE}
1777*14b24e2bSVaishali Kulkarni%Placeholder - owner Yuval
1778*14b24e2bSVaishali Kulkarni%\end{NOTICE}
1779*14b24e2bSVaishali Kulkarni
1780*14b24e2bSVaishali Kulkarni\appendix
1781*14b24e2bSVaishali Kulkarni\chapter{Osal Documentation}
1782*14b24e2bSVaishali Kulkarni\label{app:osal}
1783*14b24e2bSVaishali Kulkarni\verbatiminput{osal.txt}
1784*14b24e2bSVaishali Kulkarni
1785*14b24e2bSVaishali Kulkarni%----------------------------------------------------------------------------------------
1786*14b24e2bSVaishali Kulkarni%	INDEX
1787*14b24e2bSVaishali Kulkarni%----------------------------------------------------------------------------------------
1788*14b24e2bSVaishali Kulkarni
1789*14b24e2bSVaishali Kulkarni\cleardoublepage
1790*14b24e2bSVaishali Kulkarni\setlength{\columnsep}{0.75cm}
1791*14b24e2bSVaishali Kulkarni%\addcontentsline{toc}{chapter}{\textcolor{ocre}{Index}}
1792*14b24e2bSVaishali Kulkarni\printindex
1793*14b24e2bSVaishali Kulkarni
1794*14b24e2bSVaishali Kulkarni\bibliography{ecore}
1795*14b24e2bSVaishali Kulkarni%----------------------------------------------------------------------------------------
1796*14b24e2bSVaishali Kulkarni\end{document}
1797