kmx git

#include "hb-aat-var-hvgl-table.hh"

#ifdef __APPLE__
// For endianness
#include <CoreFoundation/CoreFoundation.h>
#endif

#if defined(HB_NO_SIMD)
#define HB_NO_APPLE_SIMD
#endif

#if !defined(HB_NO_APPLE_SIMD) && !(defined(__APPLE__) && \
  (!defined(MAC_OS_X_VERSION_MIN_REQUIRED) || MAC_OS_X_VERSION_MIN_REQUIRED >= 101300) \
)
#define HB_NO_APPLE_SIMD
#endif

#ifndef HB_NO_APPLE_SIMD
#include <simd/simd.h> // Apple SIMD https://developer.apple.com/documentation/accelerate/simd
#endif

#ifndef HB_NO_SIMD
#ifdef __AVX2__
#include <immintrin.h>
#endif
#endif


#ifndef HB_NO_VAR_HVF

namespace AAT {

namespace hvgl_impl {


/* Slots of the four doubles that make up one segment record.
 * Code in this file indexes a segment_t with these values. */
enum segment_point_t
{
  SEGMENT_POINT_ON_CURVE_X  = 0, /* x of the segment's on-curve point */
  SEGMENT_POINT_ON_CURVE_Y  = 1, /* y of the segment's on-curve point */
  SEGMENT_POINT_OFF_CURVE_X = 2, /* x of the segment's off-curve point */
  SEGMENT_POINT_OFF_CURVE_Y = 3, /* y of the segment's off-curve point */
};

/* Per-segment blend type values. PartShape::get_path_at() switches on
 * these to decide how a segment's on-curve point is resolved before
 * the path is drawn. */
enum blend_type_t
{
  BLEND_TYPE_CURVE               = 0, /* on-curve point interpolated between off-curve points */
  BLEND_TYPE_CORNER              = 1, /* on-curve point used as stored */
  BLEND_TYPE_TANGENT             = 2, /* on-curve point projected onto the off-curve line */
  BLEND_TYPE_TANGENT_PAIR_FIRST  = 3, /* this and the next on-curve point are projected */
  BLEND_TYPE_TANGENT_PAIR_SECOND = 4, /* no dedicated case in get_path_at()'s switch */
};

/* A segment is addressed as a pointer to its first double; the slots are
 * named by segment_point_t.  Note that `const segment_t` is a constant
 * pointer (double *const), not a pointer to constant data. */
using segment_t = double *;

/* Moves the on-curve point of `oncurve` onto the line segment that joins
 * the off-curve point of `offcurve1` with the off-curve point of
 * `offcurve2`.
 *
 * The point is projected orthogonally onto that line and the projection
 * parameter is clamped to [0, 1], so the result never leaves the span
 * between the two off-curve points.  When both off-curve points coincide
 * the on-curve point is placed on the first one.
 *
 * Only the on-curve slots of `oncurve` are written; the off-curve slots
 * of all three segments are left untouched. */
static void
project_on_curve_to_tangent (const segment_t offcurve1,
			     segment_t oncurve,
			     const segment_t offcurve2)
{
  /* Start of the line and its direction vector. */
  const double ax = offcurve1[SEGMENT_POINT_OFF_CURVE_X];
  const double ay = offcurve1[SEGMENT_POINT_OFF_CURVE_Y];
  const double bx = offcurve2[SEGMENT_POINT_OFF_CURVE_X];
  const double by = offcurve2[SEGMENT_POINT_OFF_CURVE_Y];
  const double dir_x = bx - ax;
  const double dir_y = by - ay;

  /* The point being projected. */
  const double px = oncurve[SEGMENT_POINT_ON_CURVE_X];
  const double py = oncurve[SEGMENT_POINT_ON_CURVE_Y];

  /* Parameter of the projection along the line; a zero-length line
   * has no direction, so fall back to its start. */
  const double len2 = dir_x * dir_x + dir_y * dir_y;
  double t = 0;
  if (len2)
    t = (dir_x * (px - ax) + dir_y * (py - ay)) / len2;
  t = hb_clamp (t, 0, 1);

  oncurve[SEGMENT_POINT_ON_CURVE_X] = ax + dir_x * t;
  oncurve[SEGMENT_POINT_ON_CURVE_Y] = ay + dir_y * t;
}

/* Builds the outline of this shape part for the given axis values and
 * emits it through the context.
 *
 * Steps, in order:
 *   1. Copy the part's base coordinates into the scratch buffer
 *      c->scratch.points.
 *   2. For every non-zero axis value, add the matching delta column,
 *      scaled by the absolute axis value, to the scratch buffer.
 *   3. For each path, resolve the per-segment blend types (which rewrite
 *      on-curve points in the scratch buffer), then emit the path.
 *
 * c:          context; provides the scratch buffer and either a
 *             draw_session (receives move_to / quadratic_to / close_path)
 *             or an extents accumulator (receives add_point).
 * coords:     axis values; only the first axisCount entries are used.
 * transforms: only transforms[0] is read; it is applied to every emitted
 *             point.
 *
 * Returns early, emitting nothing, if the scratch buffer is in error.
 *
 * The target attributes below let the AVX2 / FMA intrinsics used in the
 * body compile when those instruction sets are enabled for the build. */
#ifndef HB_NO_SIMD
#ifdef __AVX2__
__attribute__((target("avx2")))
#endif
#ifdef __FMA__
__attribute__((target("fma")))
#endif
#endif
void
PartShape::get_path_at (const hb_hvgl_context_t *c,
		        hb_array_t<const double> coords,
			hb_array_t<hb_transform_t<double>> transforms) const
{
  hb_transform_t<double> transform = transforms[0];

  // The variable-length members follow each other in memory; each one is
  // located from the end of the previous one.
  const auto &blendTypes = StructAfter<decltype (blendTypesX)> (segmentCountPerPath, pathCount);

  const auto &padding = StructAfter<decltype (paddingX)> (blendTypes, segmentCount);
  const auto &coordinates = StructAfter<decltype (coordinatesX)> (padding, this);

  auto a = coordinates.get_coords (segmentCount);
  auto &v = c->scratch.points;

  // `le` tells whether the host is little-endian.  When neither
  // __BYTE_ORDER nor __APPLE__ is available it conservatively stays false,
  // which only disables the fast paths below.
#ifdef __BYTE_ORDER
  constexpr bool le = __BYTE_ORDER == __LITTLE_ENDIAN;
#elif defined(__APPLE__)
  bool le = CFByteOrderGetCurrent () == CFByteOrderLittleEndian;
#else
  constexpr bool le = false;
#endif

  if (le)
  {
    // Endianness matches; Faster to memcpy().
    v.resize (a.length, false);
    memcpy (v.arrayZ, a.arrayZ, v.length * sizeof (v[0]));
  }
  else
  {
    // Element-wise copy (presumably converting each stored value to a
    // native double -- confirm against the coordinate type).
    v.resize (0);
    v.extend (a);
  }

  if (unlikely (v.in_error ()))
    return;

  coords = coords.sub_array (0, axisCount);
  // Apply deltas
  if (coords)
  {
    // The delta matrix is addressed one column at a time; every column
    // holds rows_count values, one per entry of the scratch buffer.
    // Each axis owns two columns: index 2*axis for negative axis values
    // and 2*axis+1 for positive ones.
    unsigned rows_count = v.length;
    const auto &deltas = StructAfter<decltype (deltasX)> (coordinates, segmentCount);
    const auto matrix = deltas.get_matrix (axisCount, segmentCount).arrayZ;
    unsigned axis_count = coords.length;
    unsigned axis_index = 0;

#ifndef HB_NO_APPLE_SIMD
    // APPLE SIMD

    bool src_aligned = (uintptr_t) matrix % 8 == 0;
    // dest is always aligned.
    if (le && src_aligned)
    {
      hb_barrier ();
      simd_double4 coords4;
      unsigned column_idx[4];
      // Consume the axes four non-zero values at a time.  When this path
      // runs it consumes every axis, so the scalar loop further down has
      // nothing left to do.
      while (axis_index < axis_count)
      {
	unsigned j;
	for (j = 0; j < 4; j++)
	{
	  // Skip axes at their default (zero) value.
	  while (axis_index < axis_count && !coords.arrayZ[axis_index])
	    axis_index++;
	  if (axis_index >= axis_count)
	  {
	    // Ran out of axes.  If nothing was gathered, leave with j == 0;
	    // otherwise pad the remaining lanes with a zero scalar pointing
	    // at column 0 so they contribute nothing.
	    if (!j)
	      break;
	    for (; j < 4; j++)
	    {
	      coords4[j] = 0.;
	      column_idx[j] = 0;
	    }
	    break;
	  }
	  double coord = (double) coords.arrayZ[axis_index];
	  coords4[j] = coord;
	  bool pos = coord > 0.;
	  column_idx[j] = axis_index * 2 + pos;
	  axis_index++;
	}
	if (!j)
	  break;

	simd_double4 scalar4 = simd_abs (coords4);
	auto *delta0 = matrix + column_idx[0] * rows_count;
	auto *delta1 = matrix + column_idx[1] * rows_count;
	auto *delta2 = matrix + column_idx[2] * rows_count;
	auto *delta3 = matrix + column_idx[3] * rows_count;

	// Note: Count is always a multiple of 4
	for (unsigned i = 0; i + 4 <= rows_count; i += 4)
	{
	  const auto &src0 = * (const simd_packed_double4 *) (void *) (delta0 + i);
	  const auto &src1 = * (const simd_packed_double4 *) (void *) (delta1 + i);
	  const auto &src2 = * (const simd_packed_double4 *) (void *) (delta2 + i);
	  const auto &src3 = * (const simd_packed_double4 *) (void *) (delta3 + i);
	  // Four rows of the four selected columns form a 4x4 matrix whose
	  // product with the scalar vector is the sum of the scaled columns.
	  const auto matrix = simd_matrix (src0, src1, src2, src3);

	  * (simd_packed_double4 *) (v.arrayZ + i) += simd_mul (matrix, scalar4);
	}
      }
    }
#endif

    // Generic path: one axis at a time.  Handles whatever axes the
    // Apple SIMD path above did not consume.
    for (; axis_index < axis_count; axis_index++)
    {
      double coord = coords.arrayZ[axis_index];
      if (!coord) continue;
      bool pos = coord > 0.;
      unsigned column_idx = axis_index * 2 + pos;
      double scalar = fabs(coord);

      const auto *src = matrix + column_idx * rows_count;
      auto *dest = v.arrayZ;
      unsigned i = 0;

#ifndef HB_NO_SIMD
      if (le && rows_count > 4)
      {
#ifdef __AVX2__
	{
	  // dest[i..i+3] += src[i..i+3] * scalar, four doubles per step,
	  // using unaligned loads and stores.
	  __m256d scalar_vec = _mm256_set1_pd(scalar);
	  for (; i + 4 <= rows_count; i += 4)
	  {
	    __m256d src_vec = _mm256_loadu_pd ((double *) &src[i]);
	    __m256d dest_vec = _mm256_loadu_pd (&dest[i]);
	    // With FMA available the `true ?` arm is always taken; the
	    // multiply-then-add expression is the fallback otherwise.
	    __m256d result =
#ifdef __FMA__
	      true ? _mm256_fmadd_pd (src_vec, scalar_vec, dest_vec) :
#endif
	      _mm256_add_pd (_mm256_mul_pd(src_vec, scalar_vec), dest_vec);

	    _mm256_storeu_pd (&dest[i], result);
	  }
	}
#endif
      }
#endif

      // Scalar fallback, manually unrolled by four; continues from
      // wherever the vector loop above stopped.
      // This loop is really hot
      for (; i + 4 <= rows_count; i += 4)
      {
	dest[i] += src[i] * scalar;
	dest[i + 1] += src[i + 1] * scalar;
	dest[i + 2] += src[i + 2] * scalar;
	dest[i + 3] += src[i + 3] * scalar;
      }
      // Note: Count is always a multiple of 4
      // So, the following not needed.
      if (false)
	for (; i < rows_count; i++)
	  dest[i] += src[i] * scalar;
    }
  }

  // Resolve blend types, one path at a time, and draw.
  // `start` / `end` are segment indices; every segment occupies four
  // consecutive doubles of the scratch buffer.
  unsigned start = 0;
  for (unsigned pathSegmentCount : segmentCountPerPath.as_array (pathCount))
  {
    unsigned end = start + pathSegmentCount;

    // Stop if the path claims more segments than the buffer holds.
    if (unlikely (end * 4 > v.length))
      break;

    // Empty path: nothing to draw.
    if (unlikely (start == end))
      continue;

    // Resolve blend types
    {
      // Paths are closed: the segment before the first one is the last.
      segment_t segment = &v.arrayZ[(end - 1) * 4];
      for (unsigned i = start; i < end; i++)
      {
	unsigned blendType = blendTypes.arrayZ[i];
	const segment_t prev_segment = segment;
	segment = &v.arrayZ[i * 4];

	switch (blendType)
	{
	// Unknown values and BLEND_TYPE_TANGENT_PAIR_SECOND land here:
	// the segment is left as is.
	default:
	  break;

	case BLEND_TYPE_CURVE:
	  {
	    // The stored on-curve x is used as an interpolation
	    // parameter, clamped to [0, 1].
	    double t = segment[SEGMENT_POINT_ON_CURVE_X];
	    t = hb_clamp (t, 0, 1);

	    /* Interpolate between the off-curve points */
	    double x = prev_segment[SEGMENT_POINT_OFF_CURVE_X] + (segment[SEGMENT_POINT_OFF_CURVE_X] - prev_segment[SEGMENT_POINT_OFF_CURVE_X]) * t;
	    double y = prev_segment[SEGMENT_POINT_OFF_CURVE_Y] + (segment[SEGMENT_POINT_OFF_CURVE_Y] - prev_segment[SEGMENT_POINT_OFF_CURVE_Y]) * t;

	    segment[SEGMENT_POINT_ON_CURVE_X] = x;
	    segment[SEGMENT_POINT_ON_CURVE_Y] = y;
	  }
	  break;

	case BLEND_TYPE_CORNER:
	  break;

	case BLEND_TYPE_TANGENT:
	  {
	    /* Project onto the line between the off-curve point
	     * of the previous segment and the off-curve point of
	     * this segment */
	    project_on_curve_to_tangent (prev_segment, segment, segment);
	  }
	  break;

	case BLEND_TYPE_TANGENT_PAIR_FIRST:
	  {
	    // The successor wraps around to the path's first segment.
	    unsigned next_i = i == end - 1 ? start : i + 1;
	    segment_t next_segment = &v.arrayZ[next_i * 4];

	    // Resolve both on-curve points of the pair here: this one
	    // against the line prev..next, the next one against the
	    // line prev..next as well (its own off-curve point).
	    project_on_curve_to_tangent (prev_segment, segment, next_segment);
	    project_on_curve_to_tangent (prev_segment, next_segment, next_segment);
	  }
	  break;
	}
      }
    }

    // Draw
    {
      // Start at the first segment's on-curve point.
      segment_t next_segment = &v.arrayZ[start * 4];
      double x0 = next_segment[SEGMENT_POINT_ON_CURVE_X];
      double y0 = next_segment[SEGMENT_POINT_ON_CURVE_Y];
      transform.transform_point (x0, y0);
      if (c->draw_session)
	c->draw_session->move_to ((float) x0, (float) y0);
      else if (c->extents)
	c->extents->add_point ((float) x0, (float) y0);
      // Each segment yields one quadratic curve: its own off-curve point
      // is the control point, the next segment's on-curve point is the
      // end point.  The last segment wraps back to the first.
      for (unsigned i = start; i < end; i++)
      {
	segment_t segment = next_segment;
	unsigned next_i = i == end - 1 ? start : i + 1;
	next_segment = &v.arrayZ[next_i * 4];

	double x1 = segment[SEGMENT_POINT_OFF_CURVE_X];
	double y1 = segment[SEGMENT_POINT_OFF_CURVE_Y];
	double x2 = next_segment[SEGMENT_POINT_ON_CURVE_X];
	double y2 = next_segment[SEGMENT_POINT_ON_CURVE_Y];
	transform.transform_point (x1, y1);
	transform.transform_point (x2, y2);
	if (c->draw_session)
	  c->draw_session->quadratic_to ((float) x1, (float) y1, (float) x2, (float) y2);
	else if (c->extents)
	{
	  // Extents mode: feed both the control and the end point.
	  c->extents->add_point ((float) x1, (float) y1);
	  c->extents->add_point ((float) x2, (float) y2);
	}
      }
      if (c->draw_session)
	c->draw_session->close_path ();
    }

    start = end;
  }
}

/* Adds this composite's axis-value deltas to `out_coords`.
 *
 * Two sparse contributions are accumulated:
 *   - master deltas, added unconditionally to the rows named by the
 *     master row-index array;
 *   - extremum deltas, added for every non-zero entry of `coords`,
 *     scaled by the absolute value of that entry.  Each axis owns two
 *     columns (2*axis for negative values, 2*axis+1 for positive ones);
 *     the column-start array gives the range of sparse entries that
 *     belongs to each column.
 *
 * out_coords: values to adjust, indexed by row.
 * coords:     this composite's own axis values; at most axisCount
 *             entries are read. */
void
PartComposite::apply_to_coords (hb_array_t<double> out_coords,
				hb_array_t<const double> coords) const
{
  /* Sparse index arrays; they follow each other in memory. */
  const auto &column_starts_table = StructAtOffset<ExtremumColumnStarts> (this, extremumColumnStartsOff4 * 4);
  const auto &column_starts = column_starts_table.extremumColumnStart;
  const auto &master_rows = StructAfter<decltype (column_starts_table.masterRowIndexX)> (column_starts_table.extremumColumnStart, 2 * axisCount + 1);
  const auto &extremum_rows = StructAfter<decltype (column_starts_table.extremumRowIndexX)> (master_rows, sparseMasterAxisValueCount);

  /* Delta values matching the index arrays above. */
  const auto &master_deltas_table = StructAtOffset<MasterAxisValueDeltas> (this, masterAxisValueDeltasOff4 * 4);
  hb_array_t<const HBFLOAT32LE> master_deltas = master_deltas_table.as_array (sparseMasterAxisValueCount);
  const auto &extremum_deltas_table = StructAtOffset<ExtremumAxisValueDeltas> (this, extremumAxisValueDeltasOff4 * 4);
  hb_array_t<const HBFLOAT32LE> extremum_deltas = extremum_deltas_table.as_array (sparseExtremumAxisValueCount);

  /* Master deltas apply regardless of the axis values. */
  const unsigned master_count = master_deltas.length;
  for (unsigned k = 0; k < master_count; k++)
    out_coords[master_rows.arrayZ[k]] += (double) master_deltas.arrayZ[k];

  /* Extremum deltas: one sparse column per non-zero axis value. */
  const unsigned n_axes = hb_min (axisCount, coords.length);
  for (unsigned axis = 0; axis < n_axes; axis++)
  {
    const double value = coords.arrayZ[axis];
    if (!value)
      continue;

    const bool positive = value > 0.;
    const unsigned column = axis * 2 + positive;

    unsigned first = column_starts.arrayZ[column];
    unsigned last = column_starts.arrayZ[column + 1];
    if (first == last)
      continue; /* Empty column. */

    const double weight = fabs (value);
    /* Never read past the delta array. */
    last = hb_min (last, extremum_deltas.length);
    for (unsigned k = first; k < last; k++)
    {
      unsigned target_row = extremum_rows.arrayZ[k];
      double delta = (double) extremum_deltas.arrayZ[k];
      out_coords[target_row] += delta * weight;
    }
  }
}

/* Applies this composite's translation and rotation deltas to
 * `transforms`, one entry per row.
 *
 * Three passes run in this order:
 *   1. extremum translations / rotations, each scaled by the absolute
 *      value of the axis its column belongs to, and only when the sign of
 *      the axis value matches the column (odd column: positive value,
 *      even column: negative value);
 *   2. master rotations, applied unconditionally;
 *   3. master translations, applied unconditionally.
 *
 * transforms: transforms to adjust, indexed by row.
 * coords:     this composite's own axis values, indexed by column / 2.
 *
 * Rows that fall outside `transforms` end the pass that meets them. */
void
PartComposite::apply_to_transforms (hb_array_t<hb_transform_t<double>> transforms,
				    hb_array_t<const double> coords) const
{
  /* The translation arrays follow each other in memory, as do the
   * rotation arrays; each one is located from the end of the previous. */
  const auto &allTranslations = StructAtOffset<AllTranslations> (this, allTranslationsOff4 * 4);
  const auto &masterTranslationDelta = allTranslations.masterTranslationDelta;
  const auto &extremumTranslationDelta = StructAfter<decltype (allTranslations.extremumTranslationDeltaX)> (masterTranslationDelta, sparseMasterTranslationCount);
  const auto &extremumTranslationIndex = StructAfter<decltype (allTranslations.extremumTranslationIndexX)> (extremumTranslationDelta, sparseExtremumTranslationCount);
  const auto &masterTranslationIndex = StructAfter<decltype (allTranslations.masterTranslationIndexX)> (extremumTranslationIndex, sparseExtremumTranslationCount);

  const auto &allRotations = StructAtOffset<AllRotations> (this, allRotationsOff4 * 4);
  const auto &masterRotationDelta = allRotations.masterRotationDelta;
  const auto &extremumRotationDelta = StructAfter<decltype (allRotations.extremumRotationDeltaX)> (masterRotationDelta, sparseMasterRotationCount);
  const auto &extremumRotationIndex = StructAfter<decltype (allRotations.extremumRotationIndexX)> (extremumRotationDelta, sparseExtremumRotationCount);
  const auto &masterRotationIndex = StructAfter<decltype (allRotations.masterRotationIndexX)> (extremumRotationIndex, sparseExtremumRotationCount);

  /* Note that the spec says walk four iterators together.
   * But with careful consideration, we have figured out the order
   * to walk two, then one, then one. This seems to work for all
   * glyphs in PingFangUI just fine.
   *
   * Moreover, for walking the two (extremum ones), if there is
   * no rotation, we use a separate, faster, loop that just walks
   * extremum translations.
   *
   * See the following commits for the move to this logic:
   *
   *   [hvgl/transforms] Break up the four-iterator loop again
   *   [hvgl/transforms] Break up some more
   *   [hvgl] Fast-path when no extremum rotations are present
   *
   */

  auto extremum_translation_indices = extremumTranslationIndex.arrayZ;
  auto extremum_translation_deltas = extremumTranslationDelta.arrayZ;
  unsigned extremum_translation_count = sparseExtremumTranslationCount;
  auto extremum_rotation_indices = extremumRotationIndex.arrayZ;
  auto extremum_rotation_deltas = extremumRotationDelta.arrayZ;
  unsigned extremum_rotation_count = sparseExtremumRotationCount;

  if (!extremum_rotation_count)
  {
    /* Fast path: translations only, no merging with rotations needed. */
    for (unsigned i = 0; i < extremum_translation_count; i++)
    {
      unsigned column = extremum_translation_indices[i].column;

      /* Skip entries whose axis is at zero or on the other side. */
      unsigned axis_idx = column / 2;
      double coord = coords[axis_idx];
      if (!coord) continue;
      bool pos = column & 1;
      if (pos != (coord > 0)) continue;
      double scalar = fabs (coord);

      unsigned row = extremum_translation_indices[i].row;
      if (unlikely (row >= transforms.length)) break;

      transforms.arrayZ[row].translate ((double) extremum_translation_deltas[i].x * scalar,
					(double) extremum_translation_deltas[i].y * scalar,
					true);
    }
  }
  else
  {
    /* Merge-walk the extremum translation and rotation lists.  The outer
     * loop picks the smallest pending row; the inner loop consumes all
     * entries of that row in increasing column order and folds them
     * into one transform that is applied to the row at the end. */
    while (true)
    {
      unsigned row = transforms.length;
      if (extremum_translation_count)
	row = hb_min (row, extremum_translation_indices->row);
      if (extremum_rotation_count)
	row = hb_min (row, extremum_rotation_indices->row);
      /* Nothing pending, or the pending rows are out of range. */
      if (row == transforms.length)
	break;

      hb_transform_t<double> transform;
      bool is_translate_only = true;
      /* Whether the inner loop consumed at least one list entry. */
      bool consumed = false;

      while (true)
      {
	bool has_row_translation = extremum_translation_count &&
				   extremum_translation_indices->row == row;
	bool has_row_rotation = extremum_rotation_count &&
				extremum_rotation_indices->row == row;

	/* Smallest pending column of this row; 2 * axisCount acts as
	 * the "none left" sentinel. */
	unsigned column = 2 * axisCount;
	if (has_row_translation)
	  column = hb_min (column, extremum_translation_indices->column);
	if (has_row_rotation)
	  column = hb_min (column, extremum_rotation_indices->column);
	if (column == 2 * axisCount)
	  break;

	const auto *extremum_translation_delta = &Null(TranslationDelta);
	double extremum_rotation_delta = 0.;

	if (has_row_translation &&
	    extremum_translation_indices->column == column)
	{
	  extremum_translation_delta = extremum_translation_deltas;
	  extremum_translation_count--;
	  extremum_translation_indices++;
	  extremum_translation_deltas++;
	  consumed = true;
	}
	if (has_row_rotation &&
	    extremum_rotation_indices->column == column)
	{
	  extremum_rotation_delta = (double) *extremum_rotation_deltas;
	  extremum_rotation_count--;
	  extremum_rotation_indices++;
	  extremum_rotation_deltas++;
	  consumed = true;
	}

	/* Skip entries whose axis is at zero or on the other side.
	 * The list entries were already consumed above. */
	unsigned axis_idx = column / 2;
	double coord = coords[axis_idx];
	if (!coord) continue;
	bool pos = column & 1;
	if (pos != (coord > 0)) continue;
	double scalar = fabs (coord);

	if (extremum_rotation_delta)
	{
	  double center_x = (double) extremum_translation_delta->x;
	  double center_y = (double) extremum_translation_delta->y;
	  double angle = extremum_rotation_delta;
	  if (center_x || center_y)
	  {
	    // The paper has formula for this in terms of complex numbers.
	    // This is translated to real numbers, partly using ChatGPT.
	    double s, c;
	    hb_sincos ((double) angle, s, c);
	    double _1_minus_c = 1 - c;
	    if (likely (_1_minus_c))
	    {
	      double s_over_1_minus_c = s / _1_minus_c;

	      double new_center_x = (center_x - center_y * s_over_1_minus_c) * .5;
	      double new_center_y = (center_y + center_x * s_over_1_minus_c) * .5;

	      center_x = new_center_x;
	      center_y = new_center_y;
	    }
	  }
	  /* The center is derived from the unscaled angle; only the
	   * rotation itself is scaled by the axis value. */
	  angle *= scalar;
	  transform.rotate_around_center (angle, center_x, center_y);
	  is_translate_only = false;
	}
	else
	{
	  // No rotation, just scale the translate
	  transform.translate ((double) extremum_translation_delta->x * scalar,
			       (double) extremum_translation_delta->y * scalar,
			       is_translate_only);
	}
      }

      /* A pending entry for this row whose column is >= 2 * axisCount is
       * never consumed by the inner loop.  Without this check the outer
       * loop would pick the same row again forever.  Such data is
       * malformed; stop processing extremum entries. */
      if (unlikely (!consumed))
	break;

      if (is_translate_only)
	transforms.arrayZ[row].translate (transform.x0, transform.y0, true);
      else
	transforms.arrayZ[row].transform (transform, true);
    }
  }

  /* Master rotations: unconditional, one per listed row. */
  auto master_rotation_indices = masterRotationIndex.arrayZ;
  auto master_rotation_deltas = masterRotationDelta.arrayZ;
  unsigned master_rotation_count = sparseMasterRotationCount;
  for (unsigned i = 0; i < master_rotation_count; i++)
  {
    unsigned row = master_rotation_indices[i];
    if (unlikely (row >= transforms.length)) break;
    transforms.arrayZ[row].rotate ((double) master_rotation_deltas[i], true);
  }

  /* Master translations: unconditional, one per listed row. */
  auto master_translation_indices = masterTranslationIndex.arrayZ;
  auto master_translation_deltas = masterTranslationDelta.arrayZ;
  unsigned master_translation_count = sparseMasterTranslationCount;
  for (unsigned i = 0; i < master_translation_count; i++)
  {
    unsigned row = master_translation_indices[i];
    if (unlikely (row >= transforms.length)) break;
    transforms.arrayZ[row].translate ((double) master_translation_deltas[i].x,
				      (double) master_translation_deltas[i].y,
				      true);
  }
}

void
PartComposite::get_path_at (const hb_hvgl_context_t *c,
			    hb_array_t<double> coords,
			    hb_array_t<hb_transform_t<double>> transforms) const
{
  const auto &subParts = StructAtOffset<SubParts> (this, subPartsOff4 * 4);

  coords = coords.sub_array (0, totalNumAxes);
  auto coords_head = coords.sub_array (0, axisCount);
  auto coords_tail = coords.sub_array (axisCount);

  apply_to_coords (coords_tail, coords_head);

  transforms = transforms.sub_array (0, totalNumParts);
  auto &transforms_head = transforms[0];
  auto transforms_tail = transforms.sub_array (1);

  apply_to_transforms (transforms_tail, coords_head);

  for (const auto &subPart : subParts.as_array (subPartCount))
  {
    auto &this_transform = transforms_tail[subPart.treeTransformIndex];

    if (likely (this_transform.is_translation ()))
    {
      double dx = this_transform.x0;
      double dy = this_transform.y0;
      this_transform = transforms_head;
      this_transform.translate (dx, dy);
    }
    else
      this_transform.transform (transforms_head, true);

    c->hvgl_table.get_part_path_at (c,
				    subPart.partIndex,
				    coords_tail.sub_array (subPart.treeAxisIndex),
				    transforms_tail.sub_array (subPart.treeTransformIndex));
  }
}


} // namespace hvgl_impl
} // namespace AAT

#endif
kc3-lang/harfbuzz/src/hb-aat-var-hvgl-table.cc

Commit

src/hb-aat-var-hvgl-table.cc