Skip to content

Commit 93771d6

Browse files
committed
Formatting and ChangeLog additions for previous commits.
1 parent c37c087 commit 93771d6

File tree

3 files changed

+181
-98
lines changed

3 files changed

+181
-98
lines changed

ChangeLog

+65-15
Original file line numberDiff line numberDiff line change
@@ -1,38 +1,88 @@
1+
2021-07-15 Ben Wagner <bungeman@chromium.org>
2+
3+
* src/smooth/ftgrays.c: Guard inclusion of `emmintrin.h`.
4+
5+
Guard inclusion of `emmintrin.h` with `#ifdef __SSE2__`. The gcc
6+
version of this header, `xmmintrin.h`, and `mmintrin.h` check that
7+
the appropriate defines are set before defining anything (are
8+
internally guarded). However, the clang versions of these includes
9+
are not internally guarded. As a result of this, externally guard
10+
the inclusion of these headers.
11+
112
2021-07-15 David Turner <david@freetype.org>
213

3-
[smooth] Implement Bezier quadratic arc flattenning with DDA
14+
[smooth] Implement Bézier quadratic arc flattening with DDA.
415

516
Benchmarking shows that this provides a very slight performance
6-
boost when rendering fonts with lots of quadratic bezier arcs,
17+
boost when rendering fonts with lots of quadratic Bézier arcs,
718
compared to the recursive arc splitting, but only when SSE2 is
819
available, or on 64-bit CPUs.
920

21+
On a 2017 Core i5-7300U CPU on Linux/x86_64:
22+
23+
ftbench -p -s10 -t5 -cb DroidSansFallbackFull.ttf
24+
25+
Before: 4.033 us/op (best of 5 runs for all numbers)
26+
After: 3.876 us/op
27+
28+
ftbench -p -s60 -t5 -cb DroidSansFallbackFull.ttf
29+
30+
Before: 13.467 us/op
31+
After: 13.385 us/op
32+
1033
* src/smooth/ftgrays.c (gray_render_conic): New implementation
1134
based on DDA and optionally SSE2.
1235

1336
2021-07-15 David Turner <david@freetype.org>
1437

15-
[smooth] Minor speedup to smooth rasterizer
38+
[smooth] Minor speedup to smooth rasterizer.
1639

17-
This speeds up the smooth rasterizer by avoiding a conditional
40+
This speeds up the smooth rasterizer by avoiding conditional
1841
branches in the hot path.
1942

20-
* src/smooth/ftgrays.c: Define a null cell used to both as a
21-
sentinel for all linked-lists, and to accumulate coverage and
22-
area values for "out-of-bounds" cell positions without a
23-
conditional check.
43+
- Define a fixed 'null cell', which will be pointed to whenever the
44+
current cell is outside of the current target region. This avoids
45+
a `ras.cell != NULL` check in the `FT_INTEGRATE` macro.
46+
47+
- Also use the null cell as a sentinel at the end of all `ycells`
48+
linked-lists, by setting its x coordinate to `INT_MAX`. This
49+
avoids an `if (!cell)` check in `gray_set_cell` as well.
50+
51+
- Slightly change the worker struct fields to perform somewhat fewer
52+
operations during rendering.
53+
54+
Example results (on a 2013 Core i5-3337U CPU):
55+
56+
out/ftbench -p -s10 -t5 -bc DroidSansFallbackFull.ttf
57+
58+
Before: 5.472 us/op
59+
After: 5.275 us/op
60+
61+
out/ftbench -p -s60 -t5 -bc DroidSansFallbackFull.ttf
62+
63+
Before: 17.988 us/op
64+
After: 17.389 us/op
65+
66+
* src/smooth/ftgrays.c (gray_TWorker): Replace `num_cells` field with
67+
`cell_free` and `cell_limit`.
68+
(NULL_CELL_PTR, CELL_MAX_X_VALUE, CELL_IS_NULL): New macros.
69+
(gray_dump_cells, gray_set_cell, gray_sweep, gray_sweep_direct,
70+
gray_convert_glyph_inner, gray_convert_glyph): Updated.
2471

2572
2021-07-15 David Turner <david@freetype.org>
2673

27-
Replaces download-test-fonts.sh with download-test-fonts.py which
28-
does the same work, and also avoids downloading anything if the
29-
files are already installed with the right content.
74+
[tests] Rewrite download script in Python3.
75+
76+
This commit replaces the bash script with a Python script that does
77+
the same work and also avoids downloading anything if the files are
78+
already installed with the right content.
3079

31-
Now uses the first 8 byte of each file's sha256 hash for the digest.
80+
We now use the first 8 bytes of each file's sha256 hash for the
81+
digest.
3282

33-
* tests/scripts/download-test-fonts.sh: Removed
34-
* tests/scripts/download-test-fonts.py: New script
35-
* tests/README.md: Updated
83+
* tests/scripts/download-test-fonts.sh: Removed.
84+
* tests/scripts/download-test-fonts.py: New script.
85+
* tests/README.md: Updated.
3686

3787
2021-07-15 Alex Richardson <Alexander.Richardson@cl.cam.ac.uk>
3888

src/smooth/ftgrays.c

+81-57
Original file line numberDiff line numberDiff line change
@@ -487,8 +487,8 @@ typedef ptrdiff_t FT_PtrDist;
487487
PCell cell_free; /* cell allocation next free slot */
488488
PCell cell_limit; /* cell allocation limit */
489489

490-
PCell* ycells; /* array of cell linked-lists, one per */
491-
/* vertical coordinate in the current band. */
490+
PCell* ycells; /* array of cell linked-lists; one per */
491+
/* vertical coordinate in the current band */
492492

493493
PCell cells; /* cell storage area */
494494
FT_PtrDist max_cells; /* cell storage capacity */
@@ -513,19 +513,21 @@ typedef ptrdiff_t FT_PtrDist;
513513
static gray_TWorker ras;
514514
#endif
515515

516-
/* Return a pointer to the "null cell", used as a sentinel at the end */
517-
/* of all ycells[] linked lists. Its x coordinate should be maximal */
518-
/* to ensure no NULL checks are necessary when looking for an insertion */
519-
/* point in gray_set_cell(). Other loops should check the cell pointer */
520-
/* with CELL_IS_NULL() to detect the end of the list. */
521-
#define NULL_CELL_PTR(ras) (ras).cells
516+
/*
517+
* Return a pointer to the 'null cell', used as a sentinel at the end of
518+
* all `ycells` linked lists. Its x coordinate should be maximal to
519+
* ensure no NULL checks are necessary when looking for an insertion point
520+
* in `gray_set_cell`. Other loops should check the cell pointer with
521+
* CELL_IS_NULL() to detect the end of the list.
522+
*/
523+
#define NULL_CELL_PTR( ras ) (ras).cells
522524

523-
/* The |x| value of the null cell. Must be the largest possible */
524-
/* integer value stored in a TCell.x field. */
525+
/* The |x| value of the null cell. Must be the largest possible */
526+
/* integer value stored in a `TCell.x` field. */
525527
#define CELL_MAX_X_VALUE INT_MAX
526528

527-
/* Return true iff |cell| points to the null cell. */
528-
#define CELL_IS_NULL(cell) ((cell)->x == CELL_MAX_X_VALUE)
529+
/* Return true iff |cell| points to the null cell. */
530+
#define CELL_IS_NULL( cell ) ( (cell)->x == CELL_MAX_X_VALUE )
529531

530532

531533
#define FT_INTEGRATE( ras, a, b ) \
@@ -556,7 +558,7 @@ typedef ptrdiff_t FT_PtrDist;
556558

557559
printf( "%3d:", y );
558560

559-
for ( ; !CELL_IS_NULL(cell); cell = cell->next )
561+
for ( ; !CELL_IS_NULL( cell ); cell = cell->next )
560562
printf( " (%3d, c:%4d, a:%6d)",
561563
cell->x, cell->cover, cell->area );
562564
printf( "\n" );
@@ -584,9 +586,11 @@ typedef ptrdiff_t FT_PtrDist;
584586
/* Note that if a cell is to the left of the clipping region, it is */
585587
/* actually set to the (min_ex-1) horizontal position. */
586588

587-
TCoord ey_index = ey - ras.min_ey;
589+
TCoord ey_index = ey - ras.min_ey;
590+
591+
588592
if ( ey_index < 0 || ey_index >= ras.count_ey || ex >= ras.max_ex )
589-
ras.cell = NULL_CELL_PTR(ras);
593+
ras.cell = NULL_CELL_PTR( ras );
590594
else
591595
{
592596
PCell* pcell = ras.ycells + ey_index;
@@ -610,7 +614,7 @@ typedef ptrdiff_t FT_PtrDist;
610614

611615
/* insert new cell */
612616
cell = ras.cell_free++;
613-
if (cell >= ras.cell_limit)
617+
if ( cell >= ras.cell_limit )
614618
ft_longjmp( ras.jump_buffer, 1 );
615619

616620
cell->x = ex;
@@ -978,6 +982,7 @@ typedef ptrdiff_t FT_PtrDist;
978982
}
979983

980984
gray_set_cell( RAS_VAR_ ex1, ey1 );
985+
981986
} while ( ex1 != ex2 || ey1 != ey2 );
982987
}
983988

@@ -987,30 +992,37 @@ typedef ptrdiff_t FT_PtrDist;
987992
FT_INTEGRATE( ras, fy2 - fy1, fx1 + fx2 );
988993

989994
End:
990-
ras.x = to_x;
991-
ras.y = to_y;
995+
ras.x = to_x;
996+
ras.y = to_y;
992997
}
993998

994999
#endif
9951000

996-
/* Benchmarking shows that using DDA to flatten the quadratic bezier
997-
* arcs is slightly faster in the following cases:
998-
*
999-
* - When the host CPU is 64-bit.
1000-
* - When SSE2 SIMD registers and instructions are available (even on x86).
1001-
*
1002-
* For other cases, using binary splits is actually slightly faster.
1003-
*/
1004-
#if defined(__SSE2__) || defined(__x86_64__) || defined(__aarch64__) || defined(_M_AMD64) || defined(_M_ARM64)
1005-
#define BEZIER_USE_DDA 1
1001+
/*
1002+
* Benchmarking shows that using DDA to flatten the quadratic Bézier arcs
1003+
* is slightly faster in the following cases:
1004+
*
1005+
* - When the host CPU is 64-bit.
1006+
* - When SSE2 SIMD registers and instructions are available (even on
1007+
* x86).
1008+
*
1009+
* For other cases, using binary splits is actually slightly faster.
1010+
*/
1011+
#if defined( __SSE2__ ) || \
1012+
defined( __x86_64__ ) || \
1013+
defined( __aarch64__ ) || \
1014+
defined( _M_AMD64 ) || \
1015+
defined( _M_ARM64 )
1016+
# define BEZIER_USE_DDA 1
10061017
#else
1007-
#define BEZIER_USE_DDA 0
1018+
# define BEZIER_USE_DDA 0
10081019
#endif
10091020

1021+
10101022
#if BEZIER_USE_DDA
10111023

10121024
#ifdef __SSE2__
1013-
#include <emmintrin.h>
1025+
# include <emmintrin.h>
10141026
#endif
10151027

10161028
static void
@@ -1058,8 +1070,8 @@ typedef ptrdiff_t FT_PtrDist;
10581070
{
10591071
dx >>= 2;
10601072
shift += 1;
1061-
}
1062-
while (dx > ONE_PIXEL / 4);
1073+
1074+
} while ( dx > ONE_PIXEL / 4 );
10631075

10641076
/*
10651077
* The (P0,P1,P2) arc equation, for t in [0,1] range:
@@ -1102,12 +1114,17 @@ typedef ptrdiff_t FT_PtrDist;
11021114
* Q << 32 = (2 * B << (32 - N)) + (A << (32 - N - N))
11031115
* = (B << (33 - N)) + (A << (32 - N - N))
11041116
*/
1117+
11051118
#ifdef __SSE2__
1106-
/* Experience shows that for small shift values, SSE2 is actually slower. */
1107-
if (shift > 2) {
1108-
union {
1109-
struct { FT_Int64 ax, ay, bx, by; } i;
1110-
struct { __m128i a, b; } vec;
1119+
/* Experience shows that for small shift values, */
1120+
/* SSE2 is actually slower. */
1121+
if ( shift > 2 )
1122+
{
1123+
union
1124+
{
1125+
struct { FT_Int64 ax, ay, bx, by; } i;
1126+
struct { __m128i a, b; } vec;
1127+
11111128
} u;
11121129

11131130
u.i.ax = p0.x + p2.x - 2 * p1.x;
@@ -1138,10 +1155,11 @@ typedef ptrdiff_t FT_PtrDist;
11381155
p = _mm_add_epi64(p, q);
11391156
q = _mm_add_epi64(q, r);
11401157

1141-
_mm_store_si128(&v.vec, p);
1158+
_mm_store_si128( &v.vec, p );
11421159

1143-
gray_render_line( RAS_VAR_ v.i.px_hi, v.i.py_hi);
1160+
gray_render_line( RAS_VAR_ v.i.px_hi, v.i.py_hi );
11441161
}
1162+
11451163
return;
11461164
}
11471165
#endif /* __SSE2__ */
@@ -1167,13 +1185,15 @@ typedef ptrdiff_t FT_PtrDist;
11671185
qx += rx;
11681186
qy += ry;
11691187

1170-
gray_render_line( RAS_VAR_ (FT_Pos)(px >> 32), (FT_Pos)(py >> 32));
1188+
gray_render_line( RAS_VAR_ (FT_Pos)( px >> 32 ),
1189+
(FT_Pos)( py >> 32 ) );
11711190
}
11721191
}
11731192

11741193
#else /* !BEZIER_USE_DDA */
11751194

1176-
/* Note that multiple attempts to speed up the function below
1195+
/*
1196+
* Note that multiple attempts to speed up the function below
11771197
* with SSE2 intrinsics, using various data layouts, have turned
11781198
* out to be slower than the non-SIMD code below.
11791199
*/
@@ -1264,12 +1284,14 @@ typedef ptrdiff_t FT_PtrDist;
12641284

12651285
#endif /* !BEZIER_USE_DDA */
12661286

1267-
/* For cubic bezier, binary splits are still faster than DDA
1287+
1288+
/*
1289+
* For cubic Bézier, binary splits are still faster than DDA
12681290
* because the splits are adaptive to how quickly each sub-arc
12691291
approaches its chord trisection points.
12701292
*
12711293
* It might be useful to experiment with SSE2 to speed up
1272-
* gray_split_cubic() though.
1294+
* `gray_split_cubic`, though.
12731295
*/
12741296
static void
12751297
gray_split_cubic( FT_Vector* base )
@@ -1361,6 +1383,7 @@ typedef ptrdiff_t FT_PtrDist;
13611383
}
13621384
}
13631385

1386+
13641387
static int
13651388
gray_move_to( const FT_Vector* to,
13661389
gray_PWorker worker )
@@ -1428,7 +1451,7 @@ typedef ptrdiff_t FT_PtrDist;
14281451
unsigned char* line = ras.target.origin - ras.target.pitch * y;
14291452

14301453

1431-
for ( ; !CELL_IS_NULL(cell); cell = cell->next )
1454+
for ( ; !CELL_IS_NULL( cell ); cell = cell->next )
14321455
{
14331456
if ( cover != 0 && cell->x > x )
14341457
{
@@ -1476,7 +1499,7 @@ typedef ptrdiff_t FT_PtrDist;
14761499
TArea area;
14771500

14781501

1479-
for ( ; !CELL_IS_NULL(cell); cell = cell->next )
1502+
for ( ; !CELL_IS_NULL( cell ); cell = cell->next )
14801503
{
14811504
if ( cover != 0 && cell->x > x )
14821505
{
@@ -1898,19 +1921,19 @@ typedef ptrdiff_t FT_PtrDist;
18981921
/* memory management */
18991922
n = ( height * sizeof ( PCell ) + sizeof ( TCell ) - 1 ) / sizeof ( TCell );
19001923

1901-
ras.cells = buffer + n;
1902-
ras.max_cells = (FT_PtrDist)( FT_MAX_GRAY_POOL - n );
1924+
ras.cells = buffer + n;
1925+
ras.max_cells = (FT_PtrDist)( FT_MAX_GRAY_POOL - n );
19031926
ras.cell_limit = ras.cells + ras.max_cells;
1904-
ras.ycells = (PCell*)buffer;
1927+
ras.ycells = (PCell*)buffer;
19051928

1906-
/* Initialize the null cell is at the start of the 'cells' array. */
1907-
/* Note that this requires ras.cell_free initialization to skip */
1908-
/* over the first entry in the array. */
1909-
PCell null_cell = NULL_CELL_PTR(ras);
1910-
null_cell->x = CELL_MAX_X_VALUE;
1911-
null_cell->area = 0;
1912-
null_cell->cover = 0;
1913-
null_cell->next = NULL;;
1929+
/* Initialize the null cell at the start of the `cells` array. */
1930+
/* Note that this requires `ras.cell_free` initialization to skip */
1931+
/* over the first entry in the array. */
1932+
PCell null_cell = NULL_CELL_PTR( ras );
1933+
null_cell->x = CELL_MAX_X_VALUE;
1934+
null_cell->area = 0;
1935+
null_cell->cover = 0;
1936+
null_cell->next = NULL;;
19141937

19151938
for ( y = yMin; y < yMax; )
19161939
{
@@ -1928,7 +1951,8 @@ typedef ptrdiff_t FT_PtrDist;
19281951
TCoord w;
19291952
int error;
19301953

1931-
for (w = 0; w < width; ++w)
1954+
1955+
for ( w = 0; w < width; ++w )
19321956
ras.ycells[w] = null_cell;
19331957

19341958
ras.cell_free = ras.cells + 1; /* NOTE: Skip over the null cell. */

0 commit comments

Comments
 (0)