-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathsksvmloaderfiledoc.html
More file actions
359 lines (244 loc) · 17 KB
/
sksvmloaderfiledoc.html
File metadata and controls
359 lines (244 loc) · 17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
<!DOCTYPE html>
<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]-->
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>load_svmlight_file — Snap Machine Learning documentation</title>
<link rel="shortcut icon" href="_static/favicon.ico"/>
<script type="text/javascript" src="_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="./" src="_static/documentation_options.js"></script>
<script type="text/javascript" src="_static/jquery.js"></script>
<script type="text/javascript" src="_static/underscore.js"></script>
<script type="text/javascript" src="_static/doctools.js"></script>
<script type="text/javascript" src="_static/language_data.js"></script>
<script type="text/javascript" src="_static/js/theme.js"></script>
<link rel="stylesheet" href="_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="_static/pygments.css" type="text/css" />
<link rel="index" title="Index" href="genindex.html" />
<link rel="search" title="Search" href="search.html" />
<link rel="next" title="log_loss" href="sklogdoc.html" />
<link rel="prev" title="decomposition.TruncatedSVD" href="svddoc.html" />
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search" >
<a href="index.html" class="icon icon-home"> Snap Machine Learning
</a>
<div class="version">
1.3.0
</div>
<div role="search">
<form id="rtd-search-form" class="wy-form" action="search.html" method="get">
<!-- Sidebar search box. The placeholder alone is not an accessible name, so aria-label names the field for assistive technology without changing the visual design. -->
<input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
<input type="hidden" name="check_keywords" value="yes" />
<input type="hidden" name="area" value="default" />
</form>
</div>
</div>
<div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation">
<p class="caption"><span class="caption-text">Overview</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="manual.html">Manual</a></li>
<li class="toctree-l1"><a class="reference internal" href="tutorials.html">Tutorials</a></li>
<li class="toctree-l1"><a class="reference internal" href="frequentlyaskedquestions.html">FAQ</a></li>
</ul>
<p class="caption"><span class="caption-text">pai4sk ML APIs</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="ridgedoc.html">linear_model.Ridge</a></li>
<li class="toctree-l1"><a class="reference internal" href="lassodoc.html">linear_model.Lasso</a></li>
<li class="toctree-l1"><a class="reference internal" href="sklogregdoc.html">linear_model.LogisticRegression</a></li>
<li class="toctree-l1"><a class="reference internal" href="svcdoc.html">svm.LinearSVC</a></li>
<li class="toctree-l1"><a class="reference internal" href="kmeansdoc.html">cluster.KMeans</a></li>
<li class="toctree-l1"><a class="reference internal" href="dbscandoc.html">cluster.DBSCAN</a></li>
<li class="toctree-l1"><a class="reference internal" href="pcadoc.html">decomposition.PCA</a></li>
<li class="toctree-l1"><a class="reference internal" href="svddoc.html">decomposition.TruncatedSVD</a></li>
</ul>
<p class="caption"><span class="caption-text">pai4sk Loaders APIs</span></p>
<ul class="current">
<li class="toctree-l1 current"><a class="current reference internal" href="#">load_svmlight_file</a></li>
</ul>
<p class="caption"><span class="caption-text">pai4sk Metrics APIs</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="sklogdoc.html">log_loss</a></li>
<li class="toctree-l1"><a class="reference internal" href="skaccdoc.html">accuracy_score</a></li>
<li class="toctree-l1"><a class="reference internal" href="skhingedoc.html">hinge_loss</a></li>
<li class="toctree-l1"><a class="reference internal" href="skmsedoc.html">mean_squared_error</a></li>
</ul>
<p class="caption"><span class="caption-text">snapML APIs</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="linregapidoc.html">LinearRegression</a></li>
<li class="toctree-l1"><a class="reference internal" href="logregapidoc.html">LogisticRegression</a></li>
<li class="toctree-l1"><a class="reference internal" href="svmapidoc.html">SVM</a></li>
<li class="toctree-l1"><a class="reference internal" href="dectreeapidoc.html">DecisionTreeClassifier</a></li>
<li class="toctree-l1"><a class="reference internal" href="ranforapidoc.html">RandomForestClassifier</a></li>
<li class="toctree-l1"><a class="reference internal" href="logdoc.html">log_loss</a></li>
<li class="toctree-l1"><a class="reference internal" href="accdoc.html">accuracy_score</a></li>
<li class="toctree-l1"><a class="reference internal" href="hingedoc.html">hinge_loss</a></li>
<li class="toctree-l1"><a class="reference internal" href="msedoc.html">mean_squared_error</a></li>
</ul>
<p class="caption"><span class="caption-text">snapML Loaders APIs</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="svmloaderdoc.html">load_from_svmlight_format</a></li>
<li class="toctree-l1"><a class="reference internal" href="snaploaderdoc.html">load_from_snap_format</a></li>
<li class="toctree-l1"><a class="reference internal" href="snaploaderfiledoc.html">load_snap_file</a></li>
<li class="toctree-l1"><a class="reference internal" href="snapwritedoc.html">write_to_snap_format</a></li>
</ul>
<p class="caption"><span class="caption-text">snapML Spark APIs</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="splinregdoc.html">LinearRegression</a></li>
<li class="toctree-l1"><a class="reference internal" href="splogregdoc.html">LogisticRegression</a></li>
<li class="toctree-l1"><a class="reference internal" href="spsvmdoc.html">SupportVectorMachine</a></li>
<li class="toctree-l1"><a class="reference internal" href="spreaddoc.html">DatasetReader</a></li>
<li class="toctree-l1"><a class="reference internal" href="spmetdoc.html">Metrics</a></li>
<li class="toctree-l1"><a class="reference internal" href="sputildoc.html">Utils</a></li>
</ul>
</div>
</div>
</nav>
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
<nav class="wy-nav-top" aria-label="top navigation">
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
<a href="index.html">Snap Machine Learning</a>
</nav>
<div class="wy-nav-content">
<div class="rst-content">
<div role="navigation" aria-label="breadcrumbs navigation">
<ul class="wy-breadcrumbs">
<li><a href="index.html">Docs</a> »</li>
<li>load_svmlight_file</li>
<li class="wy-breadcrumbs-aside">
</li>
</ul>
<hr/>
</div>
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
<div itemprop="articleBody">
<div class="section" id="load-svmlight-file">
<span id="sk-svmloader-file-doc"></span><h1>load_svmlight_file<a class="headerlink" href="#load-svmlight-file" title="Permalink to this headline">¶</a></h1>
<dl class="function">
<dt id="pai4sk.datasets.load_svmlight_file">
<code class="descclassname">pai4sk.datasets.</code><code class="descname">load_svmlight_file</code><span class="sig-paren">(</span><em>f</em>, <em>n_features=None</em>, <em>dtype=&lt;class 'numpy.float64'&gt;</em>, <em>multilabel=False</em>, <em>zero_based='auto'</em>, <em>query_id=False</em>, <em>offset=0</em>, <em>length=-1</em><span class="sig-paren">)</span><a class="headerlink" href="#pai4sk.datasets.load_svmlight_file" title="Permalink to this definition">¶</a></dt>
<dd><p>Load datasets in the svmlight / libsvm format into sparse CSR matrix</p>
<p>This format is a text-based format, with one sample per line. It does
not store zero-valued features and hence is suitable for sparse datasets.</p>
<p>The first element of each line can be used to store a target variable
to predict.</p>
<p>This format is used as the default format for both svmlight and the
libsvm command line programs.</p>
<p>Parsing a text-based source can be expensive. When working
repeatedly on the same dataset, it is recommended to wrap this
loader with joblib.Memory.cache to store a memmapped backup of the
CSR results of the first call and benefit from the near instantaneous
loading of memmapped structures for the subsequent calls.</p>
<p>In case the file contains pairwise preference constraints (known
as “qid” in the svmlight format) these are ignored unless the
query_id parameter is set to True. These pairwise preference
constraints can be used to constrain the combination of samples
when using pairwise loss functions (as is the case in some
learning to rank problems) so that only pairs with the same
query_id value are considered.</p>
<p>This implementation is written in Cython and is reasonably fast.
However, a faster API-compatible loader is also available at:</p>
<blockquote>
<div><a class="reference external" href="https://github.com/mblondel/svmlight-loader">https://github.com/mblondel/svmlight-loader</a></div></blockquote>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>f</strong> (<em>{str</em><em>, </em><em>file-like</em><em>, </em><em>int}</em>) – (Path to) a file to load. If a path ends in “.gz” or “.bz2”, it will
be uncompressed on the fly. If an integer is passed, it is assumed to
be a file descriptor. A file-like or file descriptor will not be closed
by this function. A file-like object must be opened in binary mode.</li>
<li><strong>n_features</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.7)"><em>int</em></a><em> or </em><a class="reference external" href="https://docs.python.org/3/library/constants.html#None" title="(in Python v3.7)"><em>None</em></a>) – The number of features to use. If None, it will be inferred. This
argument is useful to load several files that are subsets of a
bigger sliced dataset: each subset might not have examples of
every feature, hence the inferred shape might vary from one
slice to another.
n_features is only required if <code class="docutils literal notranslate"><span class="pre">offset</span></code> or <code class="docutils literal notranslate"><span class="pre">length</span></code> are passed a
non-default value.</li>
<li><strong>dtype</strong> (<em>numpy data type</em><em>, </em><em>default np.float64</em>) – Data type of dataset to be loaded. This will be the data type of the
output numpy arrays <code class="docutils literal notranslate"><span class="pre">X</span></code> and <code class="docutils literal notranslate"><span class="pre">y</span></code>.</li>
<li><strong>multilabel</strong> (<em>boolean</em><em>, </em><em>optional</em><em>, </em><em>default False</em>) – Samples may have several labels each (see
<a class="reference external" href="https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multilabel.html">https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multilabel.html</a>)</li>
<li><strong>zero_based</strong> (<em>boolean</em><em> or </em><em>"auto"</em><em>, </em><em>optional</em><em>, </em><em>default "auto"</em>) – Whether column indices in f are zero-based (True) or one-based
(False). If column indices are one-based, they are transformed to
zero-based to match Python/NumPy conventions.
If set to “auto”, a heuristic check is applied to determine this from
the file contents. Both kinds of files occur “in the wild”, but they
are unfortunately not self-identifying. Using “auto” or True should
always be safe when no <code class="docutils literal notranslate"><span class="pre">offset</span></code> or <code class="docutils literal notranslate"><span class="pre">length</span></code> is passed.
If <code class="docutils literal notranslate"><span class="pre">offset</span></code> or <code class="docutils literal notranslate"><span class="pre">length</span></code> are passed, the “auto” mode falls back
to <code class="docutils literal notranslate"><span class="pre">zero_based=True</span></code> to avoid having the heuristic check yield
inconsistent results on different segments of the file.</li>
<li><strong>query_id</strong> (<em>boolean</em><em>, </em><em>default False</em>) – If True, will return the query_id array for each file.</li>
<li><strong>offset</strong> (<em>integer</em><em>, </em><em>optional</em><em>, </em><em>default 0</em>) – Ignore the offset first bytes by seeking forward, then
discarding the following bytes up until the next new line
character.</li>
<li><strong>length</strong> (<em>integer</em><em>, </em><em>optional</em><em>, </em><em>default -1</em>) – If strictly positive, stop reading any new line of data once the
position in the file has reached the (offset + length) bytes threshold.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last"><ul class="simple">
<li><strong>X</strong> (<em>scipy.sparse matrix of shape (n_samples, n_features)</em>)</li>
<li><strong>y</strong> (<em>ndarray of shape (n_samples,), or, in the multilabel case, a list of tuples of length n_samples</em>)</li>
<li><strong>query_id</strong> (<em>array of shape (n_samples,)</em>) – query_id for each sample. Only returned when query_id is set to
True.</li>
</ul>
</p>
</td>
</tr>
</tbody>
</table>
<div class="admonition seealso">
<p class="first admonition-title">See also</p>
<dl class="last docutils">
<dt><code class="xref py py-func docutils literal notranslate"><span class="pre">load_svmlight_files()</span></code></dt>
<dd>similar function for loading multiple files in this format, enforcing the same number of features/columns on all of them.</dd>
</dl>
</div>
<p class="rubric">Examples</p>
<p>To use joblib.Memory to cache the svmlight file:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">joblib</span> <span class="k">import</span> <span class="n">Memory</span>
<span class="kn">from</span> <span class="nn">pai4sk.datasets</span> <span class="k">import</span> <span class="n">load_svmlight_file</span>
<span class="n">mem</span> <span class="o">=</span> <span class="n">Memory</span><span class="p">(</span><span class="s2">"./mycache"</span><span class="p">)</span>
<span class="nd">@mem</span><span class="o">.</span><span class="n">cache</span>
<span class="k">def</span> <span class="nf">get_data</span><span class="p">():</span>
<span class="n">data</span> <span class="o">=</span> <span class="n">load_svmlight_file</span><span class="p">(</span><span class="s2">"mysvmlightfile"</span><span class="p">)</span>
<span class="k">return</span> <span class="n">data</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="n">data</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span>
<span class="n">X</span><span class="p">,</span> <span class="n">y</span> <span class="o">=</span> <span class="n">get_data</span><span class="p">()</span>
</pre></div>
</div>
</dd></dl>
</div>
</div>
</div>
<footer>
<div class="rst-footer-buttons" role="navigation" aria-label="footer navigation">
<a href="sklogdoc.html" class="btn btn-neutral float-right" title="log_loss" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="svddoc.html" class="btn btn-neutral float-left" title="decomposition.TruncatedSVD" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
<hr/>
<div role="contentinfo">
<p>
© Copyright IBM Corporation 2018, 2019
</p>
</div>
Built with <a href="http://sphinx-doc.org/">Sphinx</a> using a <a href="https://github.com/rtfd/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
</footer>
</div>
</div>
</section>
</div>
<script type="text/javascript">
// Hand jQuery a callback so it runs once the document is ready, then
// call the theme's SphinxRtdTheme.Navigation.enable with `true`.
// NOTE(review): SphinxRtdTheme is presumably defined by _static/js/theme.js
// loaded in <head>; the meaning of the `true` argument is not visible here.
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</body>
</html>