;
;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION_RODATA
; Rounding term added before the ">> 4" of every bilinear filter step.
pw_8: times  8 dw  8

; Bilinear filter taps, one 32-byte entry per sub-pixel offset k = 0..7:
; eight words of the first tap (16 - 2k) followed by eight words of the
; second tap (2k).  Code indexes the table with (offset << 5) and reads the
; second tap at +16.  Entry 4 is the (8, 8) pair, written as "times 16".
bilin_filter_m_sse2: times  8 dw 16
                     times  8 dw  0
                     times  8 dw 14
                     times  8 dw  2
                     times  8 dw 12
                     times  8 dw  4
                     times  8 dw 10
                     times  8 dw  6
                     times 16 dw  8
                     times  8 dw  6
                     times  8 dw 10
                     times  8 dw  4
                     times  8 dw 12
                     times  8 dw  2
                     times  8 dw 14

SECTION .text

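; int vpx_highbd_sub_pixel_varianceNxh(const uint16_t *src,
;                                      ptrdiff_t src_stride,
;                                      int x_offset, int y_offset,
;                                      const uint16_t *ref,
;                                      ptrdiff_t ref_stride,
;                                      int height, unsigned int *sse);
;
; The "avg" variant (highbd_sub_pixel_avg_varianceNxh) takes two extra
; arguments, second_pred and second_stride, between ref_stride and height.
; Samples are read as 16-bit words and strides are counted in samples.
;
; This function returns the SE and stores SSE in the given pointer.
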
; SUM_SSE src1, ref1, src2, ref2, sum, sse
;   Accumulates the differences (src1 - ref1) and (src2 - ref2), each a
;   vector of eight signed words, into two dword accumulators:
;     sse (%6) += squared differences (pmaddwd adds adjacent pairs)
;     sum (%5) += differences, folded to four words and sign-extended
;   Clobbers %1, %2, %3 and %4.
%macro SUM_SSE 6 ; src1, ref1, src2, ref2, sum, sse
  psubw     %3, %4
  psubw     %1, %2
  mova      %4, %3       ; make copies to manipulate to calc sum
  mova      %2, %1       ; use originals for calc sse
  pmaddwd   %3, %3
  paddw     %4, %2
  pmaddwd   %1, %1
  movhlps   %2, %4       ; bring the upper four words down
  paddd     %6, %3
  paddw     %4, %2       ; low four words now hold the folded sum
  pxor      %2, %2
  pcmpgtw   %2, %4       ; mask for 0 > %4 (sum)
  punpcklwd %4, %2       ; sign-extend word to dword
  paddd     %6, %1
  paddd     %5, %4

%endmacro

; STORE_AND_RET
;   Reduces the two accumulators and returns from the function:
;     m7 (sse) -> stored through the sse pointer argument
;     m6 (sum) -> returned in eax
;   Clobbers m3, m4 and r1.
%macro STORE_AND_RET 0
%if mmsize == 16
  ; m6 and m7 each hold four dword partial results (SUM_SSE has already
  ; sign-extended the word differences), so both are reduced with plain
  ; dword adds: fold the high qword onto the low one, then dword 1 onto
  ; dword 0.
  movhlps   m3, m7
  movhlps   m4, m6
  paddd     m7, m3
  paddd     m6, m4
  pshufd    m3, m7, 0x1
  pshufd    m4, m6, 0x1
  paddd     m7, m3
  paddd     m6, m4
  mov       r1, ssem     ; r1 = unsigned int *sse
  movd      [r1], m7     ; store sse
  movd      eax, m6      ; store sum as return value
%endif
  RET
%endmacro

; INC_SRC_BY_SRC_STRIDE
;   Advances srcq by one row, i.e. by 2 * src_stride bytes.
;   In the PIC x86-32 build the x/y bilinear path reuses the src_stride
;   register as a temporary, so there the stride is added twice from its
;   stack slot instead of being read from the register.
%macro INC_SRC_BY_SRC_STRIDE 0
%if ARCH_X86=1 && CONFIG_PIC=1
  add       srcq, src_stridemp
  add       srcq, src_stridemp
%else
  lea       srcq, [srcq + src_strideq*2]
%endif
%endmacro

; SUBPEL_VARIANCE W [, avg]
;   %1 = block width; two code paths exist, %1 == 16 (one row of two
;        registers per iteration) and %1 < 16 (two rows of one register per
;        iteration, so the row count is halved up front).
;   %2 = 1 emits the "avg" variant, which additionally averages the filtered
;        source with second_pred (pavgw) before the comparison; default 0.
;
;   Emits highbd_sub_pixel_variance%1xh / highbd_sub_pixel_avg_variance%1xh.
;   x_offset and y_offset are each classified as zero (no filtering), half
;   (pavgw of the two neighbours) or other (bilinear taps loaded from
;   bilin_filter_m), giving nine loops.  m6 accumulates the sum of
;   differences and m7 the sum of squared differences (see SUM_SSE); every
;   path ends in STORE_AND_RET.
%macro SUBPEL_VARIANCE 1-2 0 ; W
%define bilin_filter_m bilin_filter_m_sse2
; Each table entry is two 16-byte tap vectors: byte offset = index << 5.
%define filter_idx_shift 5
; Offset that selects the (8, 8) taps, i.e. entry 4 of bilin_filter_m.  For
; those taps ((8*a + 8*b + 8) >> 4) equals pavgw(a, b), so the "half" loops
; below use pavgw instead of the multiply path.  The previous value of 8
; lies beyond the 8-entry table, which left the half-pel loops unreachable
; for in-range offsets.
%define half_pel_offset 4


%if ARCH_X86_64
  %if %2 == 1 ; avg
    cglobal highbd_sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
                                      x_offset, y_offset, \
                                      ref, ref_stride, \
                                      second_pred, second_stride, height, sse
    %define second_str second_strideq
  %else
    cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \
                                  x_offset, y_offset, \
                                  ref, ref_stride, height, sse
  %endif
  %define block_height heightd
  ; The sse register is free until STORE_AND_RET reloads the pointer from
  ; ssem, so it doubles as the filter table base.
  %define bilin_filter sseq
%else
  %if CONFIG_PIC=1
    %if %2 == 1 ; avg
      cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
                                        x_offset, y_offset, \
                                        ref, ref_stride, \
                                        second_pred, second_stride, height, sse
      %define block_height dword heightm
      %define second_str second_stridemp
    %else
      cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
                                    x_offset, y_offset, \
                                    ref, ref_stride, height, sse
      %define block_height heightd
    %endif

    ; reuse argument stack space
    %define g_bilin_filterm x_offsetm
    %define g_pw_8m y_offsetm

    ; Store bilin_filter and pw_8 location in stack
    %if GET_GOT_DEFINED == 1
      GET_GOT eax
      add esp, 4                ; restore esp
    %endif

    lea ecx, [GLOBAL(bilin_filter_m)]
    mov g_bilin_filterm, ecx

    lea ecx, [GLOBAL(pw_8)]
    mov g_pw_8m, ecx

    LOAD_IF_USED 0, 1         ; load eax, ecx back
  %else
    %if %2 == 1 ; avg
      cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
                                        x_offset, y_offset, \
                                        ref, ref_stride, \
                                        second_pred, second_stride, height, sse
      %define block_height dword heightm
      %define second_str second_stridemp
    %else
      cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
                                    x_offset, y_offset, \
                                    ref, ref_stride, height, sse
      %define block_height heightd
    %endif

    %define bilin_filter bilin_filter_m
  %endif
%endif

  ASSERT    %1 <= 16         ; only the W == 16 and W < 16 paths exist below
  pxor      m6, m6           ; sum
  pxor      m7, m7           ; sse

%if %1 < 16
  sar       block_height, 1  ; two rows are consumed per iteration
%endif
%if %2 == 1 ; avg
  shl       second_str, 1    ; second_pred stride: samples -> bytes
%endif

  ; FIXME(rbultje) replace by jumptable?
  test      x_offsetd, x_offsetd
  jnz .x_nonzero
  ; x_offset == 0
  test      y_offsetd, y_offsetd
  jnz .x_zero_y_nonzero

  ; x_offset == 0 && y_offset == 0
.x_zero_y_zero_loop:
%if %1 == 16
  movu      m0, [srcq]
  movu      m2, [srcq + 16]
  mova      m1, [refq]
  mova      m3, [refq + 16]
%if %2 == 1 ; avg
  pavgw     m0, [second_predq]
  pavgw     m2, [second_predq+16]
%endif
  SUM_SSE   m0, m1, m2, m3, m6, m7

  lea       srcq, [srcq + src_strideq*2]
  lea       refq, [refq + ref_strideq*2]
%if %2 == 1 ; avg
  add       second_predq, second_str
%endif
%else ; %1 < 16
  movu      m0, [srcq]
  movu      m2, [srcq + src_strideq*2]
  mova      m1, [refq]
  mova      m3, [refq + ref_strideq*2]
%if %2 == 1 ; avg
  pavgw     m0, [second_predq]
  add       second_predq, second_str
  pavgw     m2, [second_predq]
%endif
  SUM_SSE   m0, m1, m2, m3, m6, m7

  lea       srcq, [srcq + src_strideq*4]
  lea       refq, [refq + ref_strideq*4]
%if %2 == 1 ; avg
  add       second_predq, second_str
%endif
%endif
  dec       block_height
  jg .x_zero_y_zero_loop
  STORE_AND_RET

.x_zero_y_nonzero:
  cmp       y_offsetd, half_pel_offset
  jne .x_zero_y_nonhalf

  ; x_offset == 0 && y_offset == 0.5
.x_zero_y_half_loop:
%if %1 == 16
  movu      m0, [srcq]
  movu      m1, [srcq+16]
  movu      m4, [srcq+src_strideq*2]
  movu      m5, [srcq+src_strideq*2+16]
  mova      m2, [refq]
  mova      m3, [refq+16]
  pavgw     m0, m4
  pavgw     m1, m5
%if %2 == 1 ; avg
  pavgw     m0, [second_predq]
  pavgw     m1, [second_predq+16]
%endif
  SUM_SSE   m0, m2, m1, m3, m6, m7

  lea       srcq, [srcq + src_strideq*2]
  lea       refq, [refq + ref_strideq*2]
%if %2 == 1 ; avg
  add       second_predq, second_str
%endif
%else ; %1 < 16
  movu      m0, [srcq]
  movu      m1, [srcq+src_strideq*2]
  movu      m5, [srcq+src_strideq*4]
  mova      m2, [refq]
  mova      m3, [refq+ref_strideq*2]
  pavgw     m0, m1
  pavgw     m1, m5
%if %2 == 1 ; avg
  pavgw     m0, [second_predq]
  add       second_predq, second_str
  pavgw     m1, [second_predq]
%endif
  SUM_SSE   m0, m2, m1, m3, m6, m7

  lea       srcq, [srcq + src_strideq*4]
  lea       refq, [refq + ref_strideq*4]
%if %2 == 1 ; avg
  add       second_predq, second_str
%endif
%endif
  dec       block_height
  jg .x_zero_y_half_loop
  STORE_AND_RET

.x_zero_y_nonhalf:
  ; x_offset == 0 && y_offset == bilin interpolation
%if ARCH_X86_64
  lea       bilin_filter, [GLOBAL(bilin_filter_m)]
%endif
  shl       y_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
  mova      m8, [bilin_filter+y_offsetq]
  mova      m9, [bilin_filter+y_offsetq+16]
  mova      m10, [GLOBAL(pw_8)]
%define filter_y_a m8
%define filter_y_b m9
%define filter_rnd m10
%else ; x86-32 or mmx
%if ARCH_X86=1 && CONFIG_PIC=1
; x_offset == 0, reuse x_offset reg
%define tempq x_offsetq
  add       y_offsetq, g_bilin_filterm
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
  mov       tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add       y_offsetq, bilin_filter
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [GLOBAL(pw_8)]
%endif
%endif

.x_zero_y_other_loop:
%if %1 == 16
  movu      m0, [srcq]
  movu      m1, [srcq + 16]
  movu      m4, [srcq+src_strideq*2]
  movu      m5, [srcq+src_strideq*2+16]
  mova      m2, [refq]
  mova      m3, [refq+16]
  ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
  ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of
  ; instructions is the same (5), but it is 1 mul instead of 2, so might be
  ; slightly faster because of pmullw latency. It would also cut our rodata
  ; tables in half for this function, and save 1-2 registers on x86-64.
  pmullw    m1, filter_y_a
  pmullw    m5, filter_y_b
  paddw     m1, filter_rnd
  pmullw    m0, filter_y_a
  pmullw    m4, filter_y_b
  paddw     m0, filter_rnd
  paddw     m1, m5
  paddw     m0, m4
  psrlw     m1, 4
  psrlw     m0, 4
%if %2 == 1 ; avg
  pavgw     m0, [second_predq]
  pavgw     m1, [second_predq+16]
%endif
  SUM_SSE   m0, m2, m1, m3, m6, m7

  lea       srcq, [srcq + src_strideq*2]
  lea       refq, [refq + ref_strideq*2]
%if %2 == 1 ; avg
  add       second_predq, second_str
%endif
%else ; %1 < 16
  movu      m0, [srcq]
  movu      m1, [srcq+src_strideq*2]
  movu      m5, [srcq+src_strideq*4]
  mova      m4, m1       ; middle row is the 2nd tap of row 0, 1st of row 1
  mova      m2, [refq]
  mova      m3, [refq+ref_strideq*2]
  pmullw    m1, filter_y_a
  pmullw    m5, filter_y_b
  paddw     m1, filter_rnd
  pmullw    m0, filter_y_a
  pmullw    m4, filter_y_b
  paddw     m0, filter_rnd
  paddw     m1, m5
  paddw     m0, m4
  psrlw     m1, 4
  psrlw     m0, 4
%if %2 == 1 ; avg
  pavgw     m0, [second_predq]
  add       second_predq, second_str
  pavgw     m1, [second_predq]
%endif
  SUM_SSE   m0, m2, m1, m3, m6, m7

  lea       srcq, [srcq + src_strideq*4]
  lea       refq, [refq + ref_strideq*4]
%if %2 == 1 ; avg
  add       second_predq, second_str
%endif
%endif
  dec       block_height
  jg .x_zero_y_other_loop
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
  STORE_AND_RET

.x_nonzero:
  cmp       x_offsetd, half_pel_offset
  jne .x_nonhalf
  ; x_offset == 0.5
  test      y_offsetd, y_offsetd
  jnz .x_half_y_nonzero

  ; x_offset == 0.5 && y_offset == 0
.x_half_y_zero_loop:
%if %1 == 16
  movu      m0, [srcq]
  movu      m1, [srcq + 16]
  movu      m4, [srcq + 2]
  movu      m5, [srcq + 18]
  mova      m2, [refq]
  mova      m3, [refq + 16]
  pavgw     m0, m4
  pavgw     m1, m5
%if %2 == 1 ; avg
  pavgw     m0, [second_predq]
  pavgw     m1, [second_predq+16]
%endif
  SUM_SSE   m0, m2, m1, m3, m6, m7

  lea       srcq, [srcq + src_strideq*2]
  lea       refq, [refq + ref_strideq*2]
%if %2 == 1 ; avg
  add       second_predq, second_str
%endif
%else ; %1 < 16
  movu      m0, [srcq]
  movu      m1, [srcq + src_strideq*2]
  movu      m4, [srcq + 2]
  movu      m5, [srcq + src_strideq*2 + 2]
  mova      m2, [refq]
  mova      m3, [refq + ref_strideq*2]
  pavgw     m0, m4
  pavgw     m1, m5
%if %2 == 1 ; avg
  pavgw     m0, [second_predq]
  add       second_predq, second_str
  pavgw     m1, [second_predq]
%endif
  SUM_SSE   m0, m2, m1, m3, m6, m7

  lea       srcq, [srcq + src_strideq*4]
  lea       refq, [refq + ref_strideq*4]
%if %2 == 1 ; avg
  add       second_predq, second_str
%endif
%endif
  dec       block_height
  jg .x_half_y_zero_loop
  STORE_AND_RET

.x_half_y_nonzero:
  cmp       y_offsetd, half_pel_offset
  jne .x_half_y_nonhalf

  ; x_offset == 0.5 && y_offset == 0.5
%if %1 == 16
  ; Prime m0/m1 with the horizontally averaged first row; each iteration
  ; then filters one new row and carries it over to the next iteration.
  movu      m0, [srcq]
  movu      m1, [srcq+16]
  movu      m2, [srcq+2]
  movu      m3, [srcq+18]
  lea       srcq, [srcq + src_strideq*2]
  pavgw     m0, m2
  pavgw     m1, m3
.x_half_y_half_loop:
  movu      m2, [srcq]
  movu      m3, [srcq + 16]
  movu      m4, [srcq + 2]
  movu      m5, [srcq + 18]
  pavgw     m2, m4
  pavgw     m3, m5
  pavgw     m0, m2
  pavgw     m1, m3
  mova      m4, [refq]
  mova      m5, [refq + 16]
%if %2 == 1 ; avg
  pavgw     m0, [second_predq]
  pavgw     m1, [second_predq+16]
%endif
  SUM_SSE   m0, m4, m1, m5, m6, m7
  mova      m0, m2
  mova      m1, m3

  lea       srcq, [srcq + src_strideq*2]
  lea       refq, [refq + ref_strideq*2]
%if %2 == 1 ; avg
  add       second_predq, second_str
%endif
%else ; %1 < 16
  movu      m0, [srcq]
  movu      m2, [srcq+2]
  lea       srcq, [srcq + src_strideq*2]
  pavgw     m0, m2
.x_half_y_half_loop:
  movu      m2, [srcq]
  movu      m3, [srcq + src_strideq*2]
  movu      m4, [srcq + 2]
  movu      m5, [srcq + src_strideq*2 + 2]
  pavgw     m2, m4
  pavgw     m3, m5
  pavgw     m0, m2
  pavgw     m2, m3
  mova      m4, [refq]
  mova      m5, [refq + ref_strideq*2]
%if %2 == 1 ; avg
  pavgw     m0, [second_predq]
  add       second_predq, second_str
  pavgw     m2, [second_predq]
%endif
  SUM_SSE   m0, m4, m2, m5, m6, m7
  mova      m0, m3

  lea       srcq, [srcq + src_strideq*4]
  lea       refq, [refq + ref_strideq*4]
%if %2 == 1 ; avg
  add       second_predq, second_str
%endif
%endif
  dec       block_height
  jg .x_half_y_half_loop
  STORE_AND_RET

.x_half_y_nonhalf:
  ; x_offset == 0.5 && y_offset == bilin interpolation
%if ARCH_X86_64
  lea       bilin_filter, [GLOBAL(bilin_filter_m)]
%endif
  shl       y_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
  mova      m8, [bilin_filter+y_offsetq]
  mova      m9, [bilin_filter+y_offsetq+16]
  mova      m10, [GLOBAL(pw_8)]
%define filter_y_a m8
%define filter_y_b m9
%define filter_rnd m10
%else  ; x86_32
%if ARCH_X86=1 && CONFIG_PIC=1
; x_offset == 0.5. We can reuse x_offset reg
%define tempq x_offsetq
  add       y_offsetq, g_bilin_filterm
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
  mov       tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add       y_offsetq, bilin_filter
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [GLOBAL(pw_8)]
%endif
%endif

%if %1 == 16
  movu      m0, [srcq]
  movu      m1, [srcq+16]
  movu      m2, [srcq+2]
  movu      m3, [srcq+18]
  lea       srcq, [srcq + src_strideq*2]
  pavgw     m0, m2
  pavgw     m1, m3
.x_half_y_other_loop:
  movu      m2, [srcq]
  movu      m3, [srcq+16]
  movu      m4, [srcq+2]
  movu      m5, [srcq+18]
  pavgw     m2, m4
  pavgw     m3, m5
  mova      m4, m2       ; keep the unscaled row for the next iteration
  mova      m5, m3
  pmullw    m1, filter_y_a
  pmullw    m3, filter_y_b
  paddw     m1, filter_rnd
  paddw     m1, m3
  pmullw    m0, filter_y_a
  pmullw    m2, filter_y_b
  paddw     m0, filter_rnd
  psrlw     m1, 4
  paddw     m0, m2
  mova      m2, [refq]
  psrlw     m0, 4
  mova      m3, [refq+16]
%if %2 == 1 ; avg
  pavgw     m0, [second_predq]
  pavgw     m1, [second_predq+16]
%endif
  SUM_SSE   m0, m2, m1, m3, m6, m7
  mova      m0, m4
  mova      m1, m5

  lea       srcq, [srcq + src_strideq*2]
  lea       refq, [refq + ref_strideq*2]
%if %2 == 1 ; avg
  add       second_predq, second_str
%endif
%else ; %1 < 16
  movu      m0, [srcq]
  movu      m2, [srcq+2]
  lea       srcq, [srcq + src_strideq*2]
  pavgw     m0, m2
.x_half_y_other_loop:
  movu      m2, [srcq]
  movu      m3, [srcq+src_strideq*2]
  movu      m4, [srcq+2]
  movu      m5, [srcq+src_strideq*2+2]
  pavgw     m2, m4
  pavgw     m3, m5
  mova      m4, m2
  mova      m5, m3
  pmullw    m4, filter_y_a
  pmullw    m3, filter_y_b
  paddw     m4, filter_rnd
  paddw     m4, m3
  pmullw    m0, filter_y_a
  pmullw    m2, filter_y_b
  paddw     m0, filter_rnd
  psrlw     m4, 4
  paddw     m0, m2
  mova      m2, [refq]
  psrlw     m0, 4
  mova      m3, [refq+ref_strideq*2]
%if %2 == 1 ; avg
  pavgw     m0, [second_predq]
  add       second_predq, second_str
  pavgw     m4, [second_predq]
%endif
  SUM_SSE   m0, m2, m4, m3, m6, m7
  mova      m0, m5

  lea       srcq, [srcq + src_strideq*4]
  lea       refq, [refq + ref_strideq*4]
%if %2 == 1 ; avg
  add       second_predq, second_str
%endif
%endif
  dec       block_height
  jg .x_half_y_other_loop
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
  STORE_AND_RET

.x_nonhalf:
  test      y_offsetd, y_offsetd
  jnz .x_nonhalf_y_nonzero

  ; x_offset == bilin interpolation && y_offset == 0
%if ARCH_X86_64
  lea       bilin_filter, [GLOBAL(bilin_filter_m)]
%endif
  shl       x_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
  mova      m8, [bilin_filter+x_offsetq]
  mova      m9, [bilin_filter+x_offsetq+16]
  mova      m10, [GLOBAL(pw_8)]
%define filter_x_a m8
%define filter_x_b m9
%define filter_rnd m10
%else    ; x86-32
%if ARCH_X86=1 && CONFIG_PIC=1
; y_offset == 0. We can reuse y_offset reg.
%define tempq y_offsetq
  add       x_offsetq, g_bilin_filterm
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
  mov       tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add       x_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_rnd [GLOBAL(pw_8)]
%endif
%endif

.x_other_y_zero_loop:
%if %1 == 16
  movu      m0, [srcq]
  movu      m1, [srcq+16]
  movu      m2, [srcq+2]
  movu      m3, [srcq+18]
  mova      m4, [refq]
  mova      m5, [refq+16]
  pmullw    m1, filter_x_a
  pmullw    m3, filter_x_b
  paddw     m1, filter_rnd
  pmullw    m0, filter_x_a
  pmullw    m2, filter_x_b
  paddw     m0, filter_rnd
  paddw     m1, m3
  paddw     m0, m2
  psrlw     m1, 4
  psrlw     m0, 4
%if %2 == 1 ; avg
  pavgw     m0, [second_predq]
  pavgw     m1, [second_predq+16]
%endif
  SUM_SSE   m0, m4, m1, m5, m6, m7

  lea       srcq, [srcq+src_strideq*2]
  lea       refq, [refq+ref_strideq*2]
%if %2 == 1 ; avg
  add       second_predq, second_str
%endif
%else ; %1 < 16
  movu      m0, [srcq]
  movu      m1, [srcq+src_strideq*2]
  movu      m2, [srcq+2]
  movu      m3, [srcq+src_strideq*2+2]
  mova      m4, [refq]
  mova      m5, [refq+ref_strideq*2]
  pmullw    m1, filter_x_a
  pmullw    m3, filter_x_b
  paddw     m1, filter_rnd
  pmullw    m0, filter_x_a
  pmullw    m2, filter_x_b
  paddw     m0, filter_rnd
  paddw     m1, m3
  paddw     m0, m2
  psrlw     m1, 4
  psrlw     m0, 4
%if %2 == 1 ; avg
  pavgw     m0, [second_predq]
  add       second_predq, second_str
  pavgw     m1, [second_predq]
%endif
  SUM_SSE   m0, m4, m1, m5, m6, m7

  lea       srcq, [srcq+src_strideq*4]
  lea       refq, [refq+ref_strideq*4]
%if %2 == 1 ; avg
  add       second_predq, second_str
%endif
%endif
  dec       block_height
  jg .x_other_y_zero_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_rnd
  STORE_AND_RET

.x_nonhalf_y_nonzero:
  cmp       y_offsetd, half_pel_offset
  jne .x_nonhalf_y_nonhalf

  ; x_offset == bilin interpolation && y_offset == 0.5
%if ARCH_X86_64
  lea       bilin_filter, [GLOBAL(bilin_filter_m)]
%endif
  shl       x_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
  mova      m8, [bilin_filter+x_offsetq]
  mova      m9, [bilin_filter+x_offsetq+16]
  mova      m10, [GLOBAL(pw_8)]
%define filter_x_a m8
%define filter_x_b m9
%define filter_rnd m10
%else    ; x86-32
%if ARCH_X86=1 && CONFIG_PIC=1
; y_offset == 0.5. We can reuse y_offset reg.
%define tempq y_offsetq
  add       x_offsetq, g_bilin_filterm
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
  mov       tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add       x_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_rnd [GLOBAL(pw_8)]
%endif
%endif

%if %1 == 16
  movu      m0, [srcq]
  movu      m1, [srcq+16]
  movu      m2, [srcq+2]
  movu      m3, [srcq+18]
  pmullw    m0, filter_x_a
  pmullw    m2, filter_x_b
  paddw     m0, filter_rnd
  pmullw    m1, filter_x_a
  pmullw    m3, filter_x_b
  paddw     m1, filter_rnd
  paddw     m0, m2
  paddw     m1, m3
  psrlw     m0, 4
  psrlw     m1, 4
  lea       srcq, [srcq+src_strideq*2]
.x_other_y_half_loop:
  movu      m2, [srcq]
  movu      m3, [srcq+16]
  movu      m4, [srcq+2]
  movu      m5, [srcq+18]
  pmullw    m2, filter_x_a
  pmullw    m4, filter_x_b
  paddw     m2, filter_rnd
  pmullw    m3, filter_x_a
  pmullw    m5, filter_x_b
  paddw     m3, filter_rnd
  paddw     m2, m4
  paddw     m3, m5
  mova      m4, [refq]
  mova      m5, [refq+16]
  psrlw     m2, 4
  psrlw     m3, 4
  pavgw     m0, m2
  pavgw     m1, m3
%if %2 == 1 ; avg
  pavgw     m0, [second_predq]
  pavgw     m1, [second_predq+16]
%endif
  SUM_SSE   m0, m4, m1, m5, m6, m7
  mova      m0, m2
  mova      m1, m3

  lea       srcq, [srcq+src_strideq*2]
  lea       refq, [refq+ref_strideq*2]
%if %2 == 1 ; avg
  add       second_predq, second_str
%endif
%else ; %1 < 16
  movu      m0, [srcq]
  movu      m2, [srcq+2]
  pmullw    m0, filter_x_a
  pmullw    m2, filter_x_b
  paddw     m0, filter_rnd
  paddw     m0, m2
  psrlw     m0, 4
  lea       srcq, [srcq+src_strideq*2]
.x_other_y_half_loop:
  movu      m2, [srcq]
  movu      m3, [srcq+src_strideq*2]
  movu      m4, [srcq+2]
  movu      m5, [srcq+src_strideq*2+2]
  pmullw    m2, filter_x_a
  pmullw    m4, filter_x_b
  paddw     m2, filter_rnd
  pmullw    m3, filter_x_a
  pmullw    m5, filter_x_b
  paddw     m3, filter_rnd
  paddw     m2, m4
  paddw     m3, m5
  mova      m4, [refq]
  mova      m5, [refq+ref_strideq*2]
  psrlw     m2, 4
  psrlw     m3, 4
  pavgw     m0, m2
  pavgw     m2, m3
%if %2 == 1 ; avg
  pavgw     m0, [second_predq]
  add       second_predq, second_str
  pavgw     m2, [second_predq]
%endif
  SUM_SSE   m0, m4, m2, m5, m6, m7
  mova      m0, m3

  lea       srcq, [srcq+src_strideq*4]
  lea       refq, [refq+ref_strideq*4]
%if %2 == 1 ; avg
  add       second_predq, second_str
%endif
%endif
  dec       block_height
  jg .x_other_y_half_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_rnd
  STORE_AND_RET

.x_nonhalf_y_nonhalf:
; loading filter - this is same as in 8-bit depth
%if ARCH_X86_64
  lea       bilin_filter, [GLOBAL(bilin_filter_m)]
%endif
  shl       x_offsetd, filter_idx_shift ; filter_idx_shift = 5
  shl       y_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
  mova      m8, [bilin_filter+x_offsetq]
  mova      m9, [bilin_filter+x_offsetq+16]
  mova      m10, [bilin_filter+y_offsetq]
  mova      m11, [bilin_filter+y_offsetq+16]
  mova      m12, [GLOBAL(pw_8)]
%define filter_x_a m8
%define filter_x_b m9
%define filter_y_a m10
%define filter_y_b m11
%define filter_rnd m12
%else   ; x86-32
%if ARCH_X86=1 && CONFIG_PIC=1
; In this case, there is NO unused register. Used src_stride register. Later,
; src_stride has to be loaded from stack when it is needed.
%define tempq src_strideq
  mov       tempq, g_bilin_filterm
  add       x_offsetq, tempq
  add       y_offsetq, tempq
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]

  mov       tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add       x_offsetq, bilin_filter
  add       y_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [GLOBAL(pw_8)]
%endif
%endif
; end of load filter

  ; x_offset == bilin interpolation && y_offset == bilin interpolation
%if %1 == 16
  ; Prime m0/m1 with the horizontally filtered first row.
  movu      m0, [srcq]
  movu      m2, [srcq+2]
  movu      m1, [srcq+16]
  movu      m3, [srcq+18]
  pmullw    m0, filter_x_a
  pmullw    m2, filter_x_b
  paddw     m0, filter_rnd
  pmullw    m1, filter_x_a
  pmullw    m3, filter_x_b
  paddw     m1, filter_rnd
  paddw     m0, m2
  paddw     m1, m3
  psrlw     m0, 4
  psrlw     m1, 4

  INC_SRC_BY_SRC_STRIDE

.x_other_y_other_loop:
  movu      m2, [srcq]
  movu      m4, [srcq+2]
  movu      m3, [srcq+16]
  movu      m5, [srcq+18]
  pmullw    m2, filter_x_a
  pmullw    m4, filter_x_b
  paddw     m2, filter_rnd
  pmullw    m3, filter_x_a
  pmullw    m5, filter_x_b
  paddw     m3, filter_rnd
  paddw     m2, m4
  paddw     m3, m5
  psrlw     m2, 4
  psrlw     m3, 4
  mova      m4, m2       ; keep the filtered row for the next iteration
  mova      m5, m3
  pmullw    m0, filter_y_a
  pmullw    m2, filter_y_b
  paddw     m0, filter_rnd
  pmullw    m1, filter_y_a
  pmullw    m3, filter_y_b
  paddw     m0, m2
  paddw     m1, filter_rnd
  mova      m2, [refq]
  paddw     m1, m3
  psrlw     m0, 4
  psrlw     m1, 4
  mova      m3, [refq+16]
%if %2 == 1 ; avg
  pavgw     m0, [second_predq]
  pavgw     m1, [second_predq+16]
%endif
  SUM_SSE   m0, m2, m1, m3, m6, m7
  mova      m0, m4
  mova      m1, m5

  INC_SRC_BY_SRC_STRIDE
  lea       refq, [refq + ref_strideq * 2]
%if %2 == 1 ; avg
  add       second_predq, second_str
%endif
%else ; %1 < 16
  movu      m0, [srcq]
  movu      m2, [srcq+2]
  pmullw    m0, filter_x_a
  pmullw    m2, filter_x_b
  paddw     m0, filter_rnd
  paddw     m0, m2
  psrlw     m0, 4

  INC_SRC_BY_SRC_STRIDE

.x_other_y_other_loop:
  movu      m2, [srcq]
  movu      m4, [srcq+2]
  INC_SRC_BY_SRC_STRIDE
  movu      m3, [srcq]
  movu      m5, [srcq+2]
  pmullw    m2, filter_x_a
  pmullw    m4, filter_x_b
  paddw     m2, filter_rnd
  pmullw    m3, filter_x_a
  pmullw    m5, filter_x_b
  paddw     m3, filter_rnd
  paddw     m2, m4
  paddw     m3, m5
  psrlw     m2, 4
  psrlw     m3, 4
  mova      m4, m2
  mova      m5, m3
  pmullw    m0, filter_y_a
  pmullw    m2, filter_y_b
  paddw     m0, filter_rnd
  pmullw    m4, filter_y_a
  pmullw    m3, filter_y_b
  paddw     m0, m2
  paddw     m4, filter_rnd
  mova      m2, [refq]
  paddw     m4, m3
  psrlw     m0, 4
  psrlw     m4, 4
  mova      m3, [refq+ref_strideq*2]
%if %2 == 1 ; avg
  pavgw     m0, [second_predq]
  add       second_predq, second_str
  pavgw     m4, [second_predq]
%endif
  SUM_SSE   m0, m2, m4, m3, m6, m7
  mova      m0, m5

  INC_SRC_BY_SRC_STRIDE
  lea       refq, [refq + ref_strideq * 4]
%if %2 == 1 ; avg
  add       second_predq, second_str
%endif
%endif
  dec       block_height
  jg .x_other_y_other_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
  STORE_AND_RET
%endmacro

; Plain variants: highbd_sub_pixel_variance8xh / 16xh.
INIT_XMM sse2
SUBPEL_VARIANCE  8
SUBPEL_VARIANCE 16

; Averaging variants (second argument 1): highbd_sub_pixel_avg_variance8xh /
; 16xh.
INIT_XMM sse2
SUBPEL_VARIANCE  8, 1
SUBPEL_VARIANCE 16, 1
; End of highbd_subpel_variance_impl_sse2.asm