A bug when indexing num_den_graphs in LFMMILoss.forward #739

Closed
Jarvan-Wang opened this issue May 14, 2021 · 9 comments
Closed

A bug when index num_den_graphs at LFMMILoss.forward #739

Jarvan-Wang opened this issue May 14, 2021 · 9 comments
Assignees
Labels

Comments

@Jarvan-Wang
Copy link
Contributor

I mentioned this bug in #730 (comment), but that issue is about a decoding error.

After reading and digging into the code, I finally found the reason. Here it is:

        num_den_reordered_graphs = k2.index(num_den_graphs, num_den_graphs_indexes)
(Pdb) p num_graphs_indexes
tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
        36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
        54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71,
        72, 73, 74, 75, 76], dtype=torch.int32)
(Pdb) p den_graphs_indexes
tensor([77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77,
        77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77,
        77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77,
        77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77,
        77, 77, 77, 77, 77], dtype=torch.int32)
(Pdb) p num_graphs.arcs_as_tensor().shape
torch.Size([19954])
(Pdb) p den_graphs.arcs_as_tensor().shape
torch.Size([167316922])
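A rough reconstruction (based on the debug session above; the exact snowfall code may differ slightly) of how LFMMILoss.forward builds these indexes: each of the 77 utterances gets its own num graph (indexes 0..76), and all of them point at the single shared den graph, which sits at position 77 of num_den_graphs.

import torch

num_fsas = 77  # batch size in the debug session above

num_graphs_indexes = torch.arange(num_fsas, dtype=torch.int32)
den_graphs_indexes = torch.full((num_fsas,), num_fsas, dtype=torch.int32)

# Interleave them so each utterance's (num, den) pair is adjacent
# (the exact interleaving in snowfall may differ):
num_den_graphs_indexes = torch.stack(
    [num_graphs_indexes, den_graphs_indexes]).t().reshape(-1)

# k2.index must then materialize 77 copies of a den graph whose arc
# tensor alone has ~167M entries (see arcs_as_tensor().shape above):
# num_den_reordered_graphs = k2.index(num_den_graphs, num_den_graphs_indexes)

On the C++ side, the k2.index call lands in IndexAxis0: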
static RaggedShape IndexAxis0(RaggedShape &src, const Array1<int32_t> &new2old,
                              Array1<int32_t> *elem_indexes /*=nullptr*/) {
  NVTX_RANGE(K2_FUNC);
  ContextPtr &c = src.Context();
  bool is_cpu = (c->GetDeviceType() == kCpu);
  K2_CHECK(IsCompatible(src, new2old));
  int32_t num_axes = src.NumAxes(), src_dim0 = src.Dim0(),
          ans_dim0 = new2old.Dim();
  if (ans_dim0 == 0) {
    if (elem_indexes) *elem_indexes = Array1<int32_t>(c, 0);
    return EmptyRaggedShape(c, num_axes);
  }


  Array2<int32_t> old_offsets,  // num_axes by ans_dim0
      new_offsets;              // num_axes by (ans_dim0 + 1).
  GetOldAndNewOffsets(src, new2old, &old_offsets, &new_offsets);

  Array1<int32_t> tot_sizes_out =
      Array1<int32_t>(new_offsets.Col(ans_dim0)).To(GetCpuContext());

  if (elem_indexes) *elem_indexes = Array1<int32_t>(c, tot_sizes_out.Back());

Some values of new_offsets before the prefix sum, i.e. before

ExclusiveSum(*new_offsets, new_offsets)

are:

src.NumElements() == 41834178
src.Dim0() == 78
old_offsets(2,1) == 4949      // i.e. before idx0=77 there are 4949 elements,
                              // and at idx0=77 there are 41834178-4949 == 41829229 elements
new_offsets(2,1) == 41829229  // this matches the number above
(gdb) parray2 new_offsets
$60 = {1 <repeats 154 times>, 16777472}
$61 = {20, 9146, 40, 9146, 56, 9146, 28, 9146, 28, 9146, 28, 9146, 28, 9146, 28, 9146, 20, 9146, 32, 9146, 28, 9146, 24, 9146, 16, 9146, 44, 9146,
  20, 9146, 40, 9146, 32, 9146, 32, 9146, 20, 9146, 32, 9146, 16, 9146, 36, 9146, 20, 9146, 36, 9146, 24, 9146, 28, 9146, 24, 9146, 24, 9146, 32,
  9146, 36, 9146, 40, 9146, 16, 9146, 24, 9146, 24, 9146, 16, 9146, 24, 9146, 16, 9146, 40, 9146, 20, 9146, 16, 9146, 16, 9146, 52, 9146, 16,
  9146, 36, 9146, 16, 9146, 16, 9146, 16, 9146, 20, 9146, 32, 9146, 20, 9146, 16, 9146, 28, 9146, 28, 9146, 28, 9146, 32, 9146, 16, 9146, 24,
  9146, 24, 9146, 24, 9146, 20, 9146, 36, 9146, 44, 9146, 20, 9146, 44, 9146, 40, 9146, 24, 9146, 20, 9146, 28, 9146, 20, 9146, 16, 9146, 36,
  9146, 28, 9146, 20, 9146, 36, 9146, 20, 9146, 28, 9146, 24, 9146, 16777472}
$62 = {47, 41829229, 97, 41829229, 137, 41829229, 67, 41829229, 67, 41829229, 67, 41829229, 67, 41829229, 67, 41829229, 47, 41829229, 77,
  41829229, 67, 41829229, 57, 41829229, 37, 41829229, 107, 41829229, 47, 41829229, 97, 41829229, 77, 41829229, 77, 41829229, 47, 41829229, 77,
  41829229, 37, 41829229, 87, 41829229, 47, 41829229, 87, 41829229, 57, 41829229, 67, 41829229, 57, 41829229, 57, 41829229, 77, 41829229, 87,
  41829229, 97, 41829229, 37, 41829229, 57, 41829229, 57, 41829229, 37, 41829229, 57, 41829229, 37, 41829229, 97, 41829229, 47, 41829229, 37,
  41829229, 37, 41829229, 127, 41829229, 37, 41829229, 87, 41829229, 37, 41829229, 37, 41829229, 37, 41829229, 47, 41829229, 77, 41829229, 47,
  41829229, 37, 41829229, 67, 41829229, 67, 41829229, 67, 41829229, 77, 41829229, 37, 41829229, 57, 41829229, 57, 41829229, 57, 41829229, 47,
  41829229, 87, 41829229, 107, 41829229, 47, 41829229, 107, 41829229, 97, 41829229, 57, 41829229, 47, 41829229, 67, 41829229, 47, 41829229, 37,
  41829229, 87, 41829229, 67, 41829229, 47, 41829229, 87, 41829229, 47, 41829229, 67, 41829229, 57, 41829229, 16777217}

After the prefix sum, new_offsets is:

(gdb) parray2 new_offsets
$63 = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36,
  37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72,
  73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106,
  107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135,
  136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154}
$64 = {0, 20, 9166, 9206, 18352, 18408, 27554, 27582, 36728, 36756, 45902, 45930, 55076, 55104, 64250, 64278, 73424, 73444, 82590, 82622, 91768,
  91796, 100942, 100966, 110112, 110128, 119274, 119318, 128464, 128484, 137630, 137670, 146816, 146848, 155994, 156026, 165172, 165192, 174338,
  174370, 183516, 183532, 192678, 192714, 201860, 201880, 211026, 211062, 220208, 220232, 229378, 229406, 238552, 238576, 247722, 247746, 256892,
  256924, 266070, 266106, 275252, 275292, 284438, 284454, 293600, 293624, 302770, 302794, 311940, 311956, 321102, 321126, 330272, 330288, 339434,
  339474, 348620, 348640, 357786, 357802, 366948, 366964, 376110, 376162, 385308, 385324, 394470, 394506, 403652, 403668, 412814, 412830, 421976,
  421992, 431138, 431158, 440304, 440336, 449482, 449502, 458648, 458664, 467810, 467838, 476984, 477012, 486158, 486186, 495332, 495364, 504510,
  504526, 513672, 513696, 522842, 522866, 532012, 532036, 541182, 541202, 550348, 550384, 559530, 559574, 568720, 568740, 577886, 577930, 587076,
  587116, 596262, 596286, 605432, 605452, 614598, 614626, 623772, 623792, 632938, 632954, 642100, 642136, 651282, 651310, 660456, 660476, 669622,
  669658, 678804, 678824, 687970, 687998, 697144, 697168, 706314}
$65 = {0, 47, 41829276, 41829373, 83658602, 83658739, 125487968, 125488035, 167317264, 167317331, 209146560, 209146627, 250975856, 250975923,
  292805152, 292805219, 334634448, 334634495, 376463724, 376463801, 418293030, 418293097, 460122326, 460122383, 501951612, 501951649, 543780878,
  543780985, 585610214, 585610261, 627439490, 627439587, 669268816, 669268893, 711098122, 711098199, 752927428, 752927475, 794756704, 794756781,
  836586010, 836586047, 878415276, 878415363, 920244592, 920244639, 962073868, 962073955, 1003903184, 1003903241, 1045732470, 1045732537,
  1087561766, 1087561823, 1129391052, 1129391109, 1171220338, 1171220415, 1213049644, 1213049731, 1254878960, 1254879057, 1296708286, 1296708323,
  1338537552, 1338537609, 1380366838, 1380366895, 1422196124, 1422196161, 1464025390, 1464025447, 1505854676, 1505854713, 1547683942, 1547684039,
  1589513268, 1589513315, 1631342544, 1631342581, 1673171810, 1673171847, 1715001076, 1715001203, 1756830432, 1756830469, 1798659698, 1798659785,
  1840489014, 1840489051, 1882318280, 1882318317, 1924147546, 1924147583, 1965976812, 1965976859, 2007806088, 2007806165, 2049635394, 2049635441,
  2091464670, 2091464707, 2133293936, 2133294003, -2119844064, -2119843997, -2078014768, -2078014701, -2036185472, -2036185395, -1994356166,
  -1994356129, -1952526900, -1952526843, -1910697614, -1910697557, -1868868328, -1868868271, -1827039042, -1827038995, -1785209766, -1785209679,
  -1743380450, -1743380343, -1701551114, -1701551067, -1659721838, -1659721731, -1617892502, -1617892405, -1576063176, -1576063119, -1534233890,
  -1534233843, -1492404614, -1492404547, -1450575318, -1450575271, -1408746042, -1408746005, -1366916776, -1366916689, -1325087460, -1325087393,
  -1283258164, -1283258117, -1241428888, -1241428801, -1199599572, -1199599525, -1157770296, -1157770229, -1115941000, -1115940943, -1074111714}

Starting from new_offsets[2][103], the values overflow int32 and go negative.

Now the problem is clear: when the den graph is large, pairing every num graph with the same big den graph makes the total element count exceed INT32_MAX, so the int32 offsets overflow.
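Here is a minimal numpy sketch of the same wraparound. The sizes are taken from the debug output above; the per-utterance num arc counts vary between 37 and 137, so a representative 67 is used:

import numpy as np

num_fsas = 77
den_arcs = 41_829_229   # arcs of the shared den graph at axis 2
num_arcs = 67           # per-utterance num graphs are tiny by comparison

# Every utterance is paired with the SAME big den graph, so den_arcs
# appears once per utterance in the sizes being prefix-summed:
sizes = np.array([num_arcs, den_arcs] * num_fsas, dtype=np.int32)

offsets = np.cumsum(sizes, dtype=np.int32)  # int32, like k2's ExclusiveSum
print(offsets[100:106])           # wraps to negative around index 103
print(sizes.sum(dtype=np.int64))  # true total: 3_220_855_792 > 2**31 - 1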

@Jarvan-Wang (Contributor, Author)

After I reduced the batch_size, there was no overflow at the point above.
But then, at:

        num_den_lats = k2.intersect_dense(num_den_reordered_graphs,
                                          dense_fsa_vec,
                                          output_beam=10.0,
                                          a_to_b_map=a_to_b_map)

I got:

MemoryError: std::bad_alloc

After debugging, I found the reason:

k2/csrc/algorithms.h:277
template <typename LambdaT>
__forceinline__ void GetNew2OldAndRowIds(
    Array1<int32_t> &row_splits,
    int32_t num_elems,  /* num_elems == -1498158200 here */
    LambdaT &lambda,
    Array1<int32_t> *new2old_out,
    Array1<int32_t> *new_row_ids_out,
    int32_t max_array_size = (1 << 20)) {
  //...
  // num_arrays == -1427
  int32_t num_arrays = (num_elems + max_array_size - 1) / max_array_size,
          num_rows = row_splits.Dim() - 1,
  //...
  // tries to allocate vectors of size -1427:
  std::vector<Array1<int32_t>> new2old(num_arrays),
      row_ids(num_arrays);

In the calling frame:

k2/csrc/intersect_dense.cu

  FsaVec FormatOutput(Array1<int32_t> *arc_map_a,
                      Array1<int32_t> *arc_map_b) {
    //...
      Array1<int32_t> &ans_row_splits3(ans_num_arcs);
    ExclusiveSum(ans_num_arcs, &ans_row_splits3);
    // starting from index 1638083, the elements of ans_row_splits3 overflow
    // tot_arcs == -1498091009
    int32_t tot_arcs = ans_row_splits3.Back();
    //...
    GetNew2OldAndRowIds(ans_row_splits3, tot_arcs, lambda_set_keep,
                    &arcs_new2old, &ans_row_ids3_subsampled);

Again, it is caused by the offsets being of type int32_t.
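A small arithmetic check (values copied from the debug output above) of how the wrapped-around int32 count propagates through GetNew2OldAndRowIds:

num_elems = -1_498_158_200      # int32 wraparound of the true element count
max_array_size = 1 << 20

# C++ integer division truncates toward zero (Python's // would floor):
num_arrays = int((num_elems + max_array_size - 1) / max_array_size)
print(num_arrays)               # -1427, matching the comment in the code

# std::vector<Array1<int32_t>> new2old(num_arrays) converts -1427 to an
# enormous size_t, and the allocation fails with std::bad_alloc.
print(num_elems + 2**32)        # 2_796_809_096: the true count, > INT32_MAX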

@danpovey @csukuangfj
Is there any plan to fix this issue and #730?
If not, please tell me; I have already wasted a lot of time on this.

@csukuangfj (Collaborator)

> After I reduce the batch_size, no overflow at the point above.

Could you reduce your --max-duration to reduce the problem size further?

> Is there any plan to fix this issue and #730?

I think @danpovey has something to say about this.

@Jarvan-Wang (Contributor, Author)

> Could you reduce your --max-duration to reduce the problem size further?

My experiment is based on espnet2 rather than snowfall, but it is similar.
batch_bins == 2560, i.e. 25.6 seconds.
I think that is already small enough; the regular value in espnet2 is 102400.
An even smaller value would be meaningless.

@danpovey (Collaborator) commented May 18, 2021 via email

@danpovey (Collaborator) commented May 18, 2021 via email

@csukuangfj (Collaborator)

> Fangjun, at some point we should probably create an option in the training code (or simply an alternative code path) which is efficient when the den graph is inconveniently large.

Will fix it in a day or two using intersect_dense_pruned and provide a command-line option to enable/disable it.
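Something along these lines (a rough sketch only, not the final patch; the beam and active-state values are illustrative, and num_graphs / den_graph / dense_fsa_vec stand for the objects already discussed above):

import k2

# num lattices: one small num graph per utterance, as before.
num_lats = k2.intersect_dense(num_graphs, dense_fsa_vec, output_beam=10.0)

# den lattice: intersect_dense_pruned accepts a SINGLE shared FSA, so the
# huge den graph is never replicated per utterance and the int32 offsets
# stay small.
den_lats = k2.intersect_dense_pruned(den_graph,
                                     dense_fsa_vec,
                                     search_beam=20.0,
                                     output_beam=8.0,
                                     min_active_states=30,
                                     max_active_states=10000)

num_scores = num_lats.get_tot_scores(log_semiring=True, use_double_scores=True)
den_scores = den_lats.get_tot_scores(log_semiring=True, use_double_scores=True)
loss = -(num_scores - den_scores).sum()  # LF-MMI objective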

@csukuangfj csukuangfj self-assigned this May 18, 2021
@danpovey (Collaborator) commented May 18, 2021 via email

@Jarvan-Wang (Contributor, Author)

> ... another way to reduce the size of the den graph would be to limit it to only seen pairs of phone symbols. But that would require some changes to the code, as our P matrix would need to be ragged, not square. I'm curious why the den graph is so large in the first place though -- i.e. what the vocabulary size is.

> Even if you were to fix the indexing error, it would likely be too large to fit in memory; I think the problem is that your den graph is too large. I think the right way to fix this is either to construct a smaller den graph somehow, or to do the den decoding separately from the num forward-backward with intersect_dense_pruned(), which allows you to use a single FSA instead of multiple FSAs. That would also be more suitable since the den graph is too large.

This fix, csukuangfj/snowfall@550a6c5, PARTLY works for our syllable-level recipe (training starts successfully, but it runs out of GPU memory after 1400 batches).
Do you have any further optimization plans to deal with this?

My character-level recipe (about 4k nodes), however, is still not working (it may be time to give up on it).

@danpovey (Collaborator) commented May 19, 2021 via email
