;
; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION_RODATA
pw_8: times 8 dw 8
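; The bilinear coefficients below are stored as pairs of 8-word vectors
; (32 bytes per pair): first the "a" (near-sample) weights, then the "b"
; weights, with a + b == 16. The code selects a pair via
; bilin_filter_m + (offset << filter_idx_shift) and computes each filtered
; sample as (a*p0 + b*p1 + 8) >> 4.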

bilin_filter_m_sse2: times 8 dw 16
                     times 8 dw 0
                     times 8 dw 14
                     times 8 dw 2
                     times 8 dw 12
                     times 8 dw 4
                     times 8 dw 10
                     times 8 dw 6
                     times 16 dw 8
                     times 8 dw 6
                     times 8 dw 10
                     times 8 dw 4
                     times 8 dw 12
                     times 8 dw 2
                     times 8 dw 14

SECTION .text

; int vpx_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
;                               int x_offset, int y_offset,
;                               const uint8_t *ref, ptrdiff_t ref_stride,
;                               int height, unsigned int *sse);
;
; This function returns the SE and stores SSE in the given pointer.
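; Note: callers are expected to combine the returned sum (SE) and the stored
; SSE into a variance along the lines of variance = sse - sum*sum/(w*h);
; this file only produces the two raw accumulations.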

%macro SUM_SSE 6 ; src1, ref1, src2, ref2, sum, sse
  psubw %3, %4
  psubw %1, %2
  mova %4, %3 ; make copies to manipulate to calc sum
  mova %2, %1 ; use originals for calc sse
  pmaddwd %3, %3
  paddw %4, %2
  pmaddwd %1, %1
  movhlps %2, %4
  paddd %6, %3
  paddw %4, %2
  pxor %2, %2
  pcmpgtw %2, %4 ; mask for 0 > %4 (sum)
  punpcklwd %4, %2 ; sign-extend word to dword
  paddd %6, %1
  paddd %5, %4
%endmacro
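; Each SUM_SSE invocation folds 16 pixel differences into the accumulators:
; m6 gathers four dword partial sums of (src - ref) and m7 four dword partial
; sums of squared differences. STORE_AND_RET below reduces both to scalars.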

%macro STORE_AND_RET 0
%if mmsize == 16
  ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
  ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
  ; We have to sign-extend it before adding the words within the register
  ; and outputting to a dword.
  movhlps m3, m7
  movhlps m4, m6
  paddd m7, m3
  paddd m6, m4
  pshufd m3, m7, 0x1
  pshufd m4, m6, 0x1
  paddd m7, m3
  paddd m6, m4
  mov r1, ssem ; r1 = unsigned int *sse
  movd [r1], m7 ; store sse
  movd eax, m6 ; store sum as return value
%endif
  RET
%endmacro

%macro INC_SRC_BY_SRC_STRIDE 0
%if ARCH_X86=1 && CONFIG_PIC=1
  add srcq, src_stridemp
  add srcq, src_stridemp
%else
  lea srcq, [srcq + src_strideq*2]
%endif
%endmacro
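; src_stride counts 16-bit samples, so advancing one row is src_stride*2
; bytes. The x86-32 PIC path adds the in-memory stride twice instead of
; loading and scaling it in a register.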

%macro SUBPEL_VARIANCE 1-2 0 ; W
%define bilin_filter_m bilin_filter_m_sse2
%define filter_idx_shift 5

%if ARCH_X86_64
  %if %2 == 1 ; avg
    cglobal highbd_sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
                                               x_offset, y_offset, \
                                               ref, ref_stride, \
                                               second_pred, second_stride, height, sse
    %define second_str second_strideq
  %else
    cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \
                                           x_offset, y_offset, \
                                           ref, ref_stride, height, sse
  %endif
  %define block_height heightd
  %define bilin_filter sseq
%else
  %if CONFIG_PIC=1
    %if %2 == 1 ; avg
      cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
                                                 x_offset, y_offset, \
                                                 ref, ref_stride, \
                                                 second_pred, second_stride, height, sse
      %define block_height dword heightm
      %define second_str second_stridemp
    %else
      cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
                                             x_offset, y_offset, \
                                             ref, ref_stride, height, sse
      %define block_height heightd
    %endif

    ; reuse argument stack space
    %define g_bilin_filterm x_offsetm
    %define g_pw_8m y_offsetm

    ; Store bilin_filter and pw_8 location in stack
    %if GET_GOT_DEFINED == 1
      GET_GOT eax
      add esp, 4 ; restore esp
    %endif

    lea ecx, [GLOBAL(bilin_filter_m)]
    mov g_bilin_filterm, ecx

    lea ecx, [GLOBAL(pw_8)]
    mov g_pw_8m, ecx

    LOAD_IF_USED 0, 1 ; load eax, ecx back
  %else
    %if %2 == 1 ; avg
      cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
                                                 x_offset, y_offset, \
                                                 ref, ref_stride, \
                                                 second_pred, second_stride, height, sse
      %define block_height dword heightm
      %define second_str second_stridemp
    %else
      cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
                                             x_offset, y_offset, \
                                             ref, ref_stride, height, sse
      %define block_height heightd
    %endif

    %define bilin_filter bilin_filter_m
  %endif
%endif

  ASSERT %1 <= 16 ; m6 overflows if w > 16
  pxor m6, m6 ; sum
  pxor m7, m7 ; sse

%if %1 < 16
  sar block_height, 1
%endif
%if %2 == 1 ; avg
  shl second_str, 1
%endif

  ; FIXME(rbultje) replace by jumptable?
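  ; The tests below dispatch on the (x_offset, y_offset) pair: each offset is
  ; either 0, 8 (half-pel, handled with a plain pavgw), or anything else (a
  ; full bilinear filter), giving the nine specialized loops that follow.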

  test x_offsetd, x_offsetd
  jnz .x_nonzero
  ; x_offset == 0
  test y_offsetd, y_offsetd
  jnz .x_zero_y_nonzero

  ; x_offset == 0 && y_offset == 0
.x_zero_y_zero_loop:
%if %1 == 16
  movu m0, [srcq]
  movu m2, [srcq + 16]
  mova m1, [refq]
  mova m3, [refq + 16]
%if %2 == 1 ; avg
  pavgw m0, [second_predq]
  pavgw m2, [second_predq+16]
%endif
  SUM_SSE m0, m1, m2, m3, m6, m7

  lea srcq, [srcq + src_strideq*2]
  lea refq, [refq + ref_strideq*2]
%if %2 == 1 ; avg
  add second_predq, second_str
%endif
%else ; %1 < 16
  movu m0, [srcq]
  movu m2, [srcq + src_strideq*2]
  mova m1, [refq]
  mova m3, [refq + ref_strideq*2]
%if %2 == 1 ; avg
  pavgw m0, [second_predq]
  add second_predq, second_str
  pavgw m2, [second_predq]
%endif
  SUM_SSE m0, m1, m2, m3, m6, m7

  lea srcq, [srcq + src_strideq*4]
  lea refq, [refq + ref_strideq*4]
%if %2 == 1 ; avg
  add second_predq, second_str
%endif
%endif
  dec block_height
  jg .x_zero_y_zero_loop
  STORE_AND_RET

.x_zero_y_nonzero:
  cmp y_offsetd, 8
  jne .x_zero_y_nonhalf

  ; x_offset == 0 && y_offset == 0.5
.x_zero_y_half_loop:
%if %1 == 16
  movu m0, [srcq]
  movu m1, [srcq+16]
  movu m4, [srcq+src_strideq*2]
  movu m5, [srcq+src_strideq*2+16]
  mova m2, [refq]
  mova m3, [refq+16]
  pavgw m0, m4
  pavgw m1, m5
%if %2 == 1 ; avg
  pavgw m0, [second_predq]
  pavgw m1, [second_predq+16]
%endif
  SUM_SSE m0, m2, m1, m3, m6, m7

  lea srcq, [srcq + src_strideq*2]
  lea refq, [refq + ref_strideq*2]
%if %2 == 1 ; avg
  add second_predq, second_str
%endif
%else ; %1 < 16
  movu m0, [srcq]
  movu m1, [srcq+src_strideq*2]
  movu m5, [srcq+src_strideq*4]
  mova m2, [refq]
  mova m3, [refq+ref_strideq*2]
  pavgw m0, m1
  pavgw m1, m5
%if %2 == 1 ; avg
  pavgw m0, [second_predq]
  add second_predq, second_str
  pavgw m1, [second_predq]
%endif
  SUM_SSE m0, m2, m1, m3, m6, m7

  lea srcq, [srcq + src_strideq*4]
  lea refq, [refq + ref_strideq*4]
%if %2 == 1 ; avg
  add second_predq, second_str
%endif
%endif
  dec block_height
  jg .x_zero_y_half_loop
  STORE_AND_RET

.x_zero_y_nonhalf:
  ; x_offset == 0 && y_offset == bilin interpolation
%if ARCH_X86_64
  lea bilin_filter, [GLOBAL(bilin_filter_m)]
%endif
  shl y_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
  mova m8, [bilin_filter+y_offsetq]
  mova m9, [bilin_filter+y_offsetq+16]
  mova m10, [GLOBAL(pw_8)]
%define filter_y_a m8
%define filter_y_b m9
%define filter_rnd m10
%else ; x86-32 or mmx
%if ARCH_X86=1 && CONFIG_PIC=1
; x_offset == 0, reuse x_offset reg
%define tempq x_offsetq
  add y_offsetq, g_bilin_filterm
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add y_offsetq, bilin_filter
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [GLOBAL(pw_8)]
%endif
%endif

.x_zero_y_other_loop:
%if %1 == 16
  movu m0, [srcq]
  movu m1, [srcq + 16]
  movu m4, [srcq+src_strideq*2]
  movu m5, [srcq+src_strideq*2+16]
  mova m2, [refq]
  mova m3, [refq+16]
  ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
  ; also do out=in1+((x*(in2-in1)+rnd)>>log2(num)). Total number of
  ; instructions is the same (5), but it is 1 mul instead of 2, so might be
  ; slightly faster because of pmullw latency. It would also cut our rodata
  ; tables in half for this function, and save 1-2 registers on x86-64.
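  ; (Sanity check of the suggestion above, with num == 16 and rnd == 8 as
  ; used here: (16-x)*in1 + x*in2 == 16*in1 + x*(in2-in1), and since 16*in1
  ; is a multiple of 16,
  ; ((16-x)*in1 + x*in2 + 8) >> 4 == in1 + ((x*(in2-in1) + 8) >> 4).)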

  pmullw m1, filter_y_a
  pmullw m5, filter_y_b
  paddw m1, filter_rnd
  pmullw m0, filter_y_a
  pmullw m4, filter_y_b
  paddw m0, filter_rnd
  paddw m1, m5
  paddw m0, m4
  psrlw m1, 4
  psrlw m0, 4
%if %2 == 1 ; avg
  pavgw m0, [second_predq]
  pavgw m1, [second_predq+16]
%endif
  SUM_SSE m0, m2, m1, m3, m6, m7

  lea srcq, [srcq + src_strideq*2]
  lea refq, [refq + ref_strideq*2]
%if %2 == 1 ; avg
  add second_predq, second_str
%endif
%else ; %1 < 16
  movu m0, [srcq]
  movu m1, [srcq+src_strideq*2]
  movu m5, [srcq+src_strideq*4]
  mova m4, m1
  mova m2, [refq]
  mova m3, [refq+ref_strideq*2]
  pmullw m1, filter_y_a
  pmullw m5, filter_y_b
  paddw m1, filter_rnd
  pmullw m0, filter_y_a
  pmullw m4, filter_y_b
  paddw m0, filter_rnd
  paddw m1, m5
  paddw m0, m4
  psrlw m1, 4
  psrlw m0, 4
%if %2 == 1 ; avg
  pavgw m0, [second_predq]
  add second_predq, second_str
  pavgw m1, [second_predq]
%endif
  SUM_SSE m0, m2, m1, m3, m6, m7

  lea srcq, [srcq + src_strideq*4]
  lea refq, [refq + ref_strideq*4]
%if %2 == 1 ; avg
  add second_predq, second_str
%endif
%endif
  dec block_height
  jg .x_zero_y_other_loop
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
  STORE_AND_RET

.x_nonzero:
  cmp x_offsetd, 8
  jne .x_nonhalf
  ; x_offset == 0.5
  test y_offsetd, y_offsetd
  jnz .x_half_y_nonzero

  ; x_offset == 0.5 && y_offset == 0
.x_half_y_zero_loop:
%if %1 == 16
  movu m0, [srcq]
  movu m1, [srcq + 16]
  movu m4, [srcq + 2]
  movu m5, [srcq + 18]
  mova m2, [refq]
  mova m3, [refq + 16]
  pavgw m0, m4
  pavgw m1, m5
%if %2 == 1 ; avg
  pavgw m0, [second_predq]
  pavgw m1, [second_predq+16]
%endif
  SUM_SSE m0, m2, m1, m3, m6, m7

  lea srcq, [srcq + src_strideq*2]
  lea refq, [refq + ref_strideq*2]
%if %2 == 1 ; avg
  add second_predq, second_str
%endif
%else ; %1 < 16
  movu m0, [srcq]
  movu m1, [srcq + src_strideq*2]
  movu m4, [srcq + 2]
  movu m5, [srcq + src_strideq*2 + 2]
  mova m2, [refq]
  mova m3, [refq + ref_strideq*2]
  pavgw m0, m4
  pavgw m1, m5
%if %2 == 1 ; avg
  pavgw m0, [second_predq]
  add second_predq, second_str
  pavgw m1, [second_predq]
%endif
  SUM_SSE m0, m2, m1, m3, m6, m7

  lea srcq, [srcq + src_strideq*4]
  lea refq, [refq + ref_strideq*4]
%if %2 == 1 ; avg
  add second_predq, second_str
%endif
%endif
  dec block_height
  jg .x_half_y_zero_loop
  STORE_AND_RET

.x_half_y_nonzero:
  cmp y_offsetd, 8
  jne .x_half_y_nonhalf

  ; x_offset == 0.5 && y_offset == 0.5
%if %1 == 16
  movu m0, [srcq]
  movu m1, [srcq+16]
  movu m2, [srcq+2]
  movu m3, [srcq+18]
  lea srcq, [srcq + src_strideq*2]
  pavgw m0, m2
  pavgw m1, m3
.x_half_y_half_loop:
  movu m2, [srcq]
  movu m3, [srcq + 16]
  movu m4, [srcq + 2]
  movu m5, [srcq + 18]
  pavgw m2, m4
  pavgw m3, m5
  pavgw m0, m2
  pavgw m1, m3
  mova m4, [refq]
  mova m5, [refq + 16]
%if %2 == 1 ; avg
  pavgw m0, [second_predq]
  pavgw m1, [second_predq+16]
%endif
  SUM_SSE m0, m4, m1, m5, m6, m7
  mova m0, m2
  mova m1, m3

  lea srcq, [srcq + src_strideq*2]
  lea refq, [refq + ref_strideq*2]
%if %2 == 1 ; avg
  add second_predq, second_str
%endif
%else ; %1 < 16
  movu m0, [srcq]
  movu m2, [srcq+2]
  lea srcq, [srcq + src_strideq*2]
  pavgw m0, m2
.x_half_y_half_loop:
  movu m2, [srcq]
  movu m3, [srcq + src_strideq*2]
  movu m4, [srcq + 2]
  movu m5, [srcq + src_strideq*2 + 2]
  pavgw m2, m4
  pavgw m3, m5
  pavgw m0, m2
  pavgw m2, m3
  mova m4, [refq]
  mova m5, [refq + ref_strideq*2]
%if %2 == 1 ; avg
  pavgw m0, [second_predq]
  add second_predq, second_str
  pavgw m2, [second_predq]
%endif
  SUM_SSE m0, m4, m2, m5, m6, m7
  mova m0, m3

  lea srcq, [srcq + src_strideq*4]
  lea refq, [refq + ref_strideq*4]
%if %2 == 1 ; avg
  add second_predq, second_str
%endif
%endif
  dec block_height
  jg .x_half_y_half_loop
  STORE_AND_RET

.x_half_y_nonhalf:
  ; x_offset == 0.5 && y_offset == bilin interpolation
%if ARCH_X86_64
  lea bilin_filter, [GLOBAL(bilin_filter_m)]
%endif
  shl y_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
  mova m8, [bilin_filter+y_offsetq]
  mova m9, [bilin_filter+y_offsetq+16]
  mova m10, [GLOBAL(pw_8)]
%define filter_y_a m8
%define filter_y_b m9
%define filter_rnd m10
%else ; x86-32
%if ARCH_X86=1 && CONFIG_PIC=1
; x_offset == 0.5. We can reuse x_offset reg
%define tempq x_offsetq
  add y_offsetq, g_bilin_filterm
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add y_offsetq, bilin_filter
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [GLOBAL(pw_8)]
%endif
%endif

%if %1 == 16
  movu m0, [srcq]
  movu m1, [srcq+16]
  movu m2, [srcq+2]
  movu m3, [srcq+18]
  lea srcq, [srcq + src_strideq*2]
  pavgw m0, m2
  pavgw m1, m3
.x_half_y_other_loop:
  movu m2, [srcq]
  movu m3, [srcq+16]
  movu m4, [srcq+2]
  movu m5, [srcq+18]
  pavgw m2, m4
  pavgw m3, m5
  mova m4, m2
  mova m5, m3
  pmullw m1, filter_y_a
  pmullw m3, filter_y_b
  paddw m1, filter_rnd
  paddw m1, m3
  pmullw m0, filter_y_a
  pmullw m2, filter_y_b
  paddw m0, filter_rnd
  psrlw m1, 4
  paddw m0, m2
  mova m2, [refq]
  psrlw m0, 4
  mova m3, [refq+16]
%if %2 == 1 ; avg
  pavgw m0, [second_predq]
  pavgw m1, [second_predq+16]
%endif
  SUM_SSE m0, m2, m1, m3, m6, m7
  mova m0, m4
  mova m1, m5

  lea srcq, [srcq + src_strideq*2]
  lea refq, [refq + ref_strideq*2]
%if %2 == 1 ; avg
  add second_predq, second_str
%endif
%else ; %1 < 16
  movu m0, [srcq]
  movu m2, [srcq+2]
  lea srcq, [srcq + src_strideq*2]
  pavgw m0, m2
.x_half_y_other_loop:
  movu m2, [srcq]
  movu m3, [srcq+src_strideq*2]
  movu m4, [srcq+2]
  movu m5, [srcq+src_strideq*2+2]
  pavgw m2, m4
  pavgw m3, m5
  mova m4, m2
  mova m5, m3
  pmullw m4, filter_y_a
  pmullw m3, filter_y_b
  paddw m4, filter_rnd
  paddw m4, m3
  pmullw m0, filter_y_a
  pmullw m2, filter_y_b
  paddw m0, filter_rnd
  psrlw m4, 4
  paddw m0, m2
  mova m2, [refq]
  psrlw m0, 4
  mova m3, [refq+ref_strideq*2]
%if %2 == 1 ; avg
  pavgw m0, [second_predq]
  add second_predq, second_str
  pavgw m4, [second_predq]
%endif
  SUM_SSE m0, m2, m4, m3, m6, m7
  mova m0, m5

  lea srcq, [srcq + src_strideq*4]
  lea refq, [refq + ref_strideq*4]
%if %2 == 1 ; avg
  add second_predq, second_str
%endif
%endif
  dec block_height
  jg .x_half_y_other_loop
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
  STORE_AND_RET

.x_nonhalf:
  test y_offsetd, y_offsetd
  jnz .x_nonhalf_y_nonzero

  ; x_offset == bilin interpolation && y_offset == 0
%if ARCH_X86_64
  lea bilin_filter, [GLOBAL(bilin_filter_m)]
%endif
  shl x_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
  mova m8, [bilin_filter+x_offsetq]
  mova m9, [bilin_filter+x_offsetq+16]
  mova m10, [GLOBAL(pw_8)]
%define filter_x_a m8
%define filter_x_b m9
%define filter_rnd m10
%else ; x86-32
%if ARCH_X86=1 && CONFIG_PIC=1
; y_offset == 0. We can reuse y_offset reg.
%define tempq y_offsetq
  add x_offsetq, g_bilin_filterm
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add x_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_rnd [GLOBAL(pw_8)]
%endif
%endif

.x_other_y_zero_loop:
%if %1 == 16
  movu m0, [srcq]
  movu m1, [srcq+16]
  movu m2, [srcq+2]
  movu m3, [srcq+18]
  mova m4, [refq]
  mova m5, [refq+16]
  pmullw m1, filter_x_a
  pmullw m3, filter_x_b
  paddw m1, filter_rnd
  pmullw m0, filter_x_a
  pmullw m2, filter_x_b
  paddw m0, filter_rnd
  paddw m1, m3
  paddw m0, m2
  psrlw m1, 4
  psrlw m0, 4
%if %2 == 1 ; avg
  pavgw m0, [second_predq]
  pavgw m1, [second_predq+16]
%endif
  SUM_SSE m0, m4, m1, m5, m6, m7

  lea srcq, [srcq+src_strideq*2]
  lea refq, [refq+ref_strideq*2]
%if %2 == 1 ; avg
  add second_predq, second_str
%endif
%else ; %1 < 16
  movu m0, [srcq]
  movu m1, [srcq+src_strideq*2]
  movu m2, [srcq+2]
  movu m3, [srcq+src_strideq*2+2]
  mova m4, [refq]
  mova m5, [refq+ref_strideq*2]
  pmullw m1, filter_x_a
  pmullw m3, filter_x_b
  paddw m1, filter_rnd
  pmullw m0, filter_x_a
  pmullw m2, filter_x_b
  paddw m0, filter_rnd
  paddw m1, m3
  paddw m0, m2
  psrlw m1, 4
  psrlw m0, 4
%if %2 == 1 ; avg
  pavgw m0, [second_predq]
  add second_predq, second_str
  pavgw m1, [second_predq]
%endif
  SUM_SSE m0, m4, m1, m5, m6, m7

  lea srcq, [srcq+src_strideq*4]
  lea refq, [refq+ref_strideq*4]
%if %2 == 1 ; avg
  add second_predq, second_str
%endif
%endif
  dec block_height
  jg .x_other_y_zero_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_rnd
  STORE_AND_RET

.x_nonhalf_y_nonzero:
  cmp y_offsetd, 8
  jne .x_nonhalf_y_nonhalf

  ; x_offset == bilin interpolation && y_offset == 0.5
%if ARCH_X86_64
  lea bilin_filter, [GLOBAL(bilin_filter_m)]
%endif
  shl x_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
  mova m8, [bilin_filter+x_offsetq]
  mova m9, [bilin_filter+x_offsetq+16]
  mova m10, [GLOBAL(pw_8)]
%define filter_x_a m8
%define filter_x_b m9
%define filter_rnd m10
%else ; x86-32
%if ARCH_X86=1 && CONFIG_PIC=1
; y_offset == 0.5. We can reuse y_offset reg.
%define tempq y_offsetq
  add x_offsetq, g_bilin_filterm
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add x_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_rnd [GLOBAL(pw_8)]
%endif
%endif

%if %1 == 16
  movu m0, [srcq]
  movu m1, [srcq+16]
  movu m2, [srcq+2]
  movu m3, [srcq+18]
  pmullw m0, filter_x_a
  pmullw m2, filter_x_b
  paddw m0, filter_rnd
  pmullw m1, filter_x_a
  pmullw m3, filter_x_b
  paddw m1, filter_rnd
  paddw m0, m2
  paddw m1, m3
  psrlw m0, 4
  psrlw m1, 4
  lea srcq, [srcq+src_strideq*2]
.x_other_y_half_loop:
  movu m2, [srcq]
  movu m3, [srcq+16]
  movu m4, [srcq+2]
  movu m5, [srcq+18]
  pmullw m2, filter_x_a
  pmullw m4, filter_x_b
  paddw m2, filter_rnd
  pmullw m3, filter_x_a
  pmullw m5, filter_x_b
  paddw m3, filter_rnd
  paddw m2, m4
  paddw m3, m5
  mova m4, [refq]
  mova m5, [refq+16]
  psrlw m2, 4
  psrlw m3, 4
  pavgw m0, m2
  pavgw m1, m3
%if %2 == 1 ; avg
  pavgw m0, [second_predq]
  pavgw m1, [second_predq+16]
%endif
  SUM_SSE m0, m4, m1, m5, m6, m7
  mova m0, m2
  mova m1, m3

  lea srcq, [srcq+src_strideq*2]
  lea refq, [refq+ref_strideq*2]
%if %2 == 1 ; avg
  add second_predq, second_str
%endif
%else ; %1 < 16
  movu m0, [srcq]
  movu m2, [srcq+2]
  pmullw m0, filter_x_a
  pmullw m2, filter_x_b
  paddw m0, filter_rnd
  paddw m0, m2
  psrlw m0, 4
  lea srcq, [srcq+src_strideq*2]
.x_other_y_half_loop:
  movu m2, [srcq]
  movu m3, [srcq+src_strideq*2]
  movu m4, [srcq+2]
  movu m5, [srcq+src_strideq*2+2]
  pmullw m2, filter_x_a
  pmullw m4, filter_x_b
  paddw m2, filter_rnd
  pmullw m3, filter_x_a
  pmullw m5, filter_x_b
  paddw m3, filter_rnd
  paddw m2, m4
  paddw m3, m5
  mova m4, [refq]
  mova m5, [refq+ref_strideq*2]
  psrlw m2, 4
  psrlw m3, 4
  pavgw m0, m2
  pavgw m2, m3
%if %2 == 1 ; avg
  pavgw m0, [second_predq]
  add second_predq, second_str
  pavgw m2, [second_predq]
%endif
  SUM_SSE m0, m4, m2, m5, m6, m7
  mova m0, m3

  lea srcq, [srcq+src_strideq*4]
  lea refq, [refq+ref_strideq*4]
%if %2 == 1 ; avg
  add second_predq, second_str
%endif
%endif
  dec block_height
  jg .x_other_y_half_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_rnd
  STORE_AND_RET

.x_nonhalf_y_nonhalf:
; loading the filters - this is the same as in the 8-bit depth version
%if ARCH_X86_64
  lea bilin_filter, [GLOBAL(bilin_filter_m)]
%endif
  shl x_offsetd, filter_idx_shift ; filter_idx_shift = 5
  shl y_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
  mova m8, [bilin_filter+x_offsetq]
  mova m9, [bilin_filter+x_offsetq+16]
  mova m10, [bilin_filter+y_offsetq]
  mova m11, [bilin_filter+y_offsetq+16]
  mova m12, [GLOBAL(pw_8)]
%define filter_x_a m8
%define filter_x_b m9
%define filter_y_a m10
%define filter_y_b m11
%define filter_rnd m12
%else ; x86-32
%if ARCH_X86=1 && CONFIG_PIC=1
; In this case there is NO unused register, so we reuse the src_stride
; register; src_stride then has to be reloaded from the stack whenever it
; is needed.
%define tempq src_strideq
  mov tempq, g_bilin_filterm
  add x_offsetq, tempq
  add y_offsetq, tempq
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]

  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add x_offsetq, bilin_filter
  add y_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [GLOBAL(pw_8)]
%endif
%endif
; end of filter load

  ; x_offset == bilin interpolation && y_offset == bilin interpolation
%if %1 == 16
  movu m0, [srcq]
  movu m2, [srcq+2]
  movu m1, [srcq+16]
  movu m3, [srcq+18]
  pmullw m0, filter_x_a
  pmullw m2, filter_x_b
  paddw m0, filter_rnd
  pmullw m1, filter_x_a
  pmullw m3, filter_x_b
  paddw m1, filter_rnd
  paddw m0, m2
  paddw m1, m3
  psrlw m0, 4
  psrlw m1, 4

  INC_SRC_BY_SRC_STRIDE

.x_other_y_other_loop:
  movu m2, [srcq]
  movu m4, [srcq+2]
  movu m3, [srcq+16]
  movu m5, [srcq+18]
  pmullw m2, filter_x_a
  pmullw m4, filter_x_b
  paddw m2, filter_rnd
  pmullw m3, filter_x_a
  pmullw m5, filter_x_b
  paddw m3, filter_rnd
  paddw m2, m4
  paddw m3, m5
  psrlw m2, 4
  psrlw m3, 4
  mova m4, m2
  mova m5, m3
  pmullw m0, filter_y_a
  pmullw m2, filter_y_b
  paddw m0, filter_rnd
  pmullw m1, filter_y_a
  pmullw m3, filter_y_b
  paddw m0, m2
  paddw m1, filter_rnd
  mova m2, [refq]
  paddw m1, m3
  psrlw m0, 4
  psrlw m1, 4
  mova m3, [refq+16]
%if %2 == 1 ; avg
  pavgw m0, [second_predq]
  pavgw m1, [second_predq+16]
%endif
  SUM_SSE m0, m2, m1, m3, m6, m7
  mova m0, m4
  mova m1, m5

  INC_SRC_BY_SRC_STRIDE
  lea refq, [refq + ref_strideq * 2]
%if %2 == 1 ; avg
  add second_predq, second_str
%endif
%else ; %1 < 16
  movu m0, [srcq]
  movu m2, [srcq+2]
  pmullw m0, filter_x_a
  pmullw m2, filter_x_b
  paddw m0, filter_rnd
  paddw m0, m2
  psrlw m0, 4

  INC_SRC_BY_SRC_STRIDE

.x_other_y_other_loop:
  movu m2, [srcq]
  movu m4, [srcq+2]
  INC_SRC_BY_SRC_STRIDE
  movu m3, [srcq]
  movu m5, [srcq+2]
  pmullw m2, filter_x_a
  pmullw m4, filter_x_b
  paddw m2, filter_rnd
  pmullw m3, filter_x_a
  pmullw m5, filter_x_b
  paddw m3, filter_rnd
  paddw m2, m4
  paddw m3, m5
  psrlw m2, 4
  psrlw m3, 4
  mova m4, m2
  mova m5, m3
  pmullw m0, filter_y_a
  pmullw m2, filter_y_b
  paddw m0, filter_rnd
  pmullw m4, filter_y_a
  pmullw m3, filter_y_b
  paddw m0, m2
  paddw m4, filter_rnd
  mova m2, [refq]
  paddw m4, m3
  psrlw m0, 4
  psrlw m4, 4
  mova m3, [refq+ref_strideq*2]
%if %2 == 1 ; avg
  pavgw m0, [second_predq]
  add second_predq, second_str
  pavgw m4, [second_predq]
%endif
  SUM_SSE m0, m2, m4, m3, m6, m7
  mova m0, m5

  INC_SRC_BY_SRC_STRIDE
  lea refq, [refq + ref_strideq * 4]
%if %2 == 1 ; avg
  add second_predq, second_str
%endif
%endif
  dec block_height
  jg .x_other_y_other_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
  STORE_AND_RET
%endmacro

INIT_XMM sse2
SUBPEL_VARIANCE 8
SUBPEL_VARIANCE 16

INIT_XMM sse2
SUBPEL_VARIANCE 8, 1
SUBPEL_VARIANCE 16, 1
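; The instantiations above emit the plain and second-prediction (avg) kernels
; for 8- and 16-sample-wide rows; height is a runtime argument, so one kernel
; per width serves every block size (presumably via the C-side variance
; wrappers).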
