Commit 41024fe

Added parallel blob enumeration (#1045)
* Added parallel blob enumeration
* Added numOfFolders flag to benchmark command
* Address comment about order
* Fix bug surfaced by rebase
* Added tool script for measuring performance
* Address comments
1 parent e586092 commit 41024fe

5 files changed: +128 -55 lines

cmd/benchmark.go (+19 -11)
@@ -44,6 +44,7 @@ type rawBenchmarkCmdArgs struct {
 	sizePerFile    string
 	fileCount      uint
 	deleteTestData bool
+	numOfFolders   uint

 	// options from flags
 	blockSizeMB float64
@@ -137,7 +138,7 @@ func (raw rawBenchmarkCmdArgs) cook() (cookedCopyCmdArgs, error) {
 		c.src = raw.target
 	} else { // Upload
 		// src must be string, but needs to indicate that it's for benchmark and encode what we want
-		c.src = benchmarkSourceHelper{}.ToUrl(raw.fileCount, bytesPerFile)
+		c.src = benchmarkSourceHelper{}.ToUrl(raw.fileCount, bytesPerFile, raw.numOfFolders)
 		c.dst, err = raw.appendVirtualDir(raw.target, virtualDir)
 		if err != nil {
 			return dummyCooked, err
@@ -253,35 +254,41 @@ type benchmarkSourceHelper struct{}
 // you want a URL that can't possibly be a real one, so we'll use that
 const benchmarkSourceHost = "benchmark.invalid"

-func (h benchmarkSourceHelper) ToUrl(fileCount uint, bytesPerFile int64) string {
-	return fmt.Sprintf("https://%s?fc=%d&bpf=%d", benchmarkSourceHost, fileCount, bytesPerFile)
+func (h benchmarkSourceHelper) ToUrl(fileCount uint, bytesPerFile int64, numOfFolders uint) string {
+	return fmt.Sprintf("https://%s?fc=%d&bpf=%d&nf=%d", benchmarkSourceHost, fileCount, bytesPerFile, numOfFolders)
 }

-func (h benchmarkSourceHelper) FromUrl(s string) (fileCount uint, bytesPerFile int64, err error) {
+func (h benchmarkSourceHelper) FromUrl(s string) (fileCount uint, bytesPerFile int64, numOfFolders uint, err error) {
 	// TODO: consider replace with regex?

 	expectedPrefix := "https://" + benchmarkSourceHost + "?"
 	if !strings.HasPrefix(s, expectedPrefix) {
-		return 0, 0, errors.New("invalid benchmark source string")
+		return 0, 0, 0, errors.New("invalid benchmark source string")
 	}
 	s = strings.TrimPrefix(s, expectedPrefix)
 	pieces := strings.Split(s, "&")
-	if len(pieces) != 2 ||
+	if len(pieces) != 3 ||
 		!strings.HasPrefix(pieces[0], "fc=") ||
-		!strings.HasPrefix(pieces[1], "bpf=") {
-		return 0, 0, errors.New("invalid benchmark source string")
+		!strings.HasPrefix(pieces[1], "bpf=") ||
+		!strings.HasPrefix(pieces[2], "nf=") {
+		return 0, 0, 0, errors.New("invalid benchmark source string")
 	}
 	pieces[0] = strings.Split(pieces[0], "=")[1]
 	pieces[1] = strings.Split(pieces[1], "=")[1]
+	pieces[2] = strings.Split(pieces[2], "=")[1]
 	fc, err := strconv.ParseUint(pieces[0], 10, 64)
 	if err != nil {
-		return 0, 0, err
+		return 0, 0, 0, err
 	}
 	bpf, err := strconv.ParseInt(pieces[1], 10, 64)
 	if err != nil {
-		return 0, 0, err
+		return 0, 0, 0, err
 	}
-	return uint(fc), bpf, nil
+	nf, err := strconv.ParseUint(pieces[2], 10, 64)
+	if err != nil {
+		return 0, 0, 0, err
+	}
+	return uint(fc), bpf, uint(nf), nil
 }

 var benchCmd *cobra.Command
@@ -330,6 +337,7 @@ func init() {

 	benchCmd.PersistentFlags().StringVar(&raw.sizePerFile, common.SizePerFileParam, "250M", "size of each auto-generated data file. Must be "+sizeStringDescription)
 	benchCmd.PersistentFlags().UintVar(&raw.fileCount, common.FileCountParam, common.FileCountDefault, "number of auto-generated data files to use")
+	benchCmd.PersistentFlags().UintVar(&raw.numOfFolders, "number-of-folders", 0, "If larger than 0, create folders to divide up the data.")
 	benchCmd.PersistentFlags().BoolVar(&raw.deleteTestData, "delete-test-data", true, "if true, the benchmark data will be deleted at the end of the benchmark run. Set it to false if you want to keep the data at the destination - e.g. to use it for manual tests outside benchmark mode")

 	benchCmd.PersistentFlags().Float64Var(&raw.blockSizeMB, "block-size-mb", 0, "use this block size (specified in MiB). Default is automatically calculated based on file size. Decimal fractions are allowed - e.g. 0.25. Identical to the same-named parameter in the copy command")
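Worth noting how the benchmark parameters travel: ToUrl packs them into a URL on the reserved host benchmark.invalid (which can never resolve), and FromUrl parses them back out in the traverser, so the benchmark source flows through the same cooking and traverser plumbing as a real remote location. A minimal, self-contained sketch of that round-trip, outside the AzCopy codebase (the function names here are illustrative, not the repo's):

```go
package main

import (
	"errors"
	"fmt"
	"strconv"
	"strings"
)

const benchmarkSourceHost = "benchmark.invalid" // reserved TLD, can never resolve

// encodeBenchmarkSource packs the benchmark parameters into a fake URL,
// mirroring benchmarkSourceHelper.ToUrl in the diff above.
func encodeBenchmarkSource(fileCount uint, bytesPerFile int64, numOfFolders uint) string {
	return fmt.Sprintf("https://%s?fc=%d&bpf=%d&nf=%d", benchmarkSourceHost, fileCount, bytesPerFile, numOfFolders)
}

// decodeBenchmarkSource reverses the encoding, mirroring FromUrl
// (this sketch parses key=value pairs generically instead of by position).
func decodeBenchmarkSource(s string) (fileCount uint, bytesPerFile int64, numOfFolders uint, err error) {
	prefix := "https://" + benchmarkSourceHost + "?"
	if !strings.HasPrefix(s, prefix) {
		return 0, 0, 0, errors.New("invalid benchmark source string")
	}
	for _, piece := range strings.Split(strings.TrimPrefix(s, prefix), "&") {
		kv := strings.SplitN(piece, "=", 2)
		if len(kv) != 2 {
			return 0, 0, 0, errors.New("invalid benchmark source string")
		}
		v, perr := strconv.ParseInt(kv[1], 10, 64)
		if perr != nil {
			return 0, 0, 0, perr
		}
		switch kv[0] {
		case "fc":
			fileCount = uint(v)
		case "bpf":
			bytesPerFile = v
		case "nf":
			numOfFolders = uint(v)
		}
	}
	return fileCount, bytesPerFile, numOfFolders, nil
}

func main() {
	src := encodeBenchmarkSource(100, 250*1024*1024, 10)
	fc, bpf, nf, err := decodeBenchmarkSource(src)
	fmt.Println(src, fc, bpf, nf, err)
	// https://benchmark.invalid?fc=100&bpf=262144000&nf=10 100 262144000 10 <nil>
}
```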

cmd/zc_traverser_benchmark.go (+8 -1)
@@ -28,17 +28,19 @@ import (
 type benchmarkTraverser struct {
 	fileCount                   uint
 	bytesPerFile                int64
+	numOfFolders                uint
 	incrementEnumerationCounter enumerationCounterFunc
 }

 func newBenchmarkTraverser(source string, incrementEnumerationCounter enumerationCounterFunc) (*benchmarkTraverser, error) {
-	fc, bpf, err := benchmarkSourceHelper{}.FromUrl(source)
+	fc, bpf, nf, err := benchmarkSourceHelper{}.FromUrl(source)
 	if err != nil {
 		return nil, err
 	}
 	return &benchmarkTraverser{
 		fileCount:                   fc,
 		bytesPerFile:                bpf,
+		numOfFolders:                nf,
 		incrementEnumerationCounter: incrementEnumerationCounter},
 		nil
 }
@@ -69,6 +71,11 @@ func (t *benchmarkTraverser) traverse(preprocessor objectMorpher, processor obje
 		name := t.toReversedString(i) // this gives an even distribution through the namespace (compare the starting characters, for 0 to 199, when reversed or not). This is useful for performance when High Throughput Block Blob pathway does not apply
 		relativePath := name

+		if t.numOfFolders > 0 {
+			assignedFolder := t.toReversedString(i % t.numOfFolders)
+			relativePath = assignedFolder + common.AZCOPY_PATH_SEPARATOR_STRING + relativePath
+		}
+
 		if t.incrementEnumerationCounter != nil {
 			t.incrementEnumerationCounter(common.EEntityType.File())
 		}
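The comment on toReversedString above explains the trick: sequential indices 0 to 199 would all cluster under the same leading characters in a sorted namespace, but reversing the digits spreads the first character of each blob name evenly, which helps when the High Throughput Block Blob pathway does not apply. A small standalone sketch of the idea (the real toReversedString is not part of this diff, so the digit-reversal implementation below is an assumption about its behavior):

```go
package main

import "fmt"

// toReversedString reverses the decimal digits of i, e.g. 190 -> "091".
// Assumed behavior; the actual helper in cmd/zc_traverser_benchmark.go is not shown in this diff.
func toReversedString(i uint) string {
	s := fmt.Sprintf("%d", i)
	out := make([]byte, len(s))
	for j := range s {
		out[len(s)-1-j] = s[j]
	}
	return string(out)
}

func main() {
	const numOfFolders = 3
	for i := uint(0); i < 6; i++ {
		name := toReversedString(i)
		// as in the diff: i % numOfFolders picks the folder, also reversed
		folder := toReversedString(i % numOfFolders)
		fmt.Printf("file %d -> %s/%s\n", i, folder, name)
	}
	// the leading characters of reversed names for 0..199 cycle through 0-9,
	// instead of 100..199 all clustering under "1" the way they would unreversed
}
```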

cmd/zc_traverser_blob.go (+60 -42)
@@ -23,6 +23,7 @@ package cmd
 import (
 	"context"
 	"fmt"
+	"github.com/Azure/azure-storage-azcopy/common/parallel"
 	"net/url"
 	"strings"

@@ -160,57 +161,74 @@ func (t *blobTraverser) traverse(preprocessor objectMorpher, processor objectPro
 	// as a performance optimization, get an extra prefix to do pre-filtering. It's typically the start portion of a blob name.
 	extraSearchPrefix := filterSet(filters).GetEnumerationPreFilter(t.recursive)

-	for marker := (azblob.Marker{}); marker.NotDone(); {
-
-		// see the TO DO in GetEnumerationPreFilter if/when we make this more directory-aware
+	// Define how to enumerate its contents
+	// This func must be thread safe/goroutine safe
+	enumerateOneDir := func(dir parallel.Directory, enqueueDir func(parallel.Directory), enqueueOutput func(parallel.DirectoryEntry, error)) error {
+		currentDirPath := dir.(string)
+		for marker := (azblob.Marker{}); marker.NotDone(); {
+			lResp, err := containerURL.ListBlobsHierarchySegment(t.ctx, marker, "/", azblob.ListBlobsSegmentOptions{Prefix: currentDirPath,
+				Details: azblob.BlobListingDetails{Metadata: true}})
+			if err != nil {
+				return fmt.Errorf("cannot list files due to reason %s", err)
+			}

-		// look for all blobs that start with the prefix
-		// TODO optimize for the case where recursive is off
-		listBlob, err := containerURL.ListBlobsFlatSegment(t.ctx, marker,
-			azblob.ListBlobsSegmentOptions{Prefix: searchPrefix + extraSearchPrefix, Details: azblob.BlobListingDetails{Metadata: true}})
-		if err != nil {
-			return fmt.Errorf("cannot list blobs. Failed with error %s", err.Error())
-		}
+			// queue up the sub virtual directories if recursive is true
+			if t.recursive {
+				for _, virtualDir := range lResp.Segment.BlobPrefixes {
+					enqueueDir(virtualDir.Name)
+				}
+			}

-		// process the blobs returned in this result segment
-		for _, blobInfo := range listBlob.Segment.BlobItems {
-			// if the blob represents a hdi folder, then skip it
-			if util.doesBlobRepresentAFolder(blobInfo.Metadata) && !(t.includeDirectoryStubs && t.recursive) {
-				continue
+			// process the blobs returned in this result segment
+			for _, blobInfo := range lResp.Segment.BlobItems {
+				// if the blob represents a hdi folder, then skip it
+				if util.doesBlobRepresentAFolder(blobInfo.Metadata) && !(t.includeDirectoryStubs && t.recursive) {
+					continue
+				}
+
+				relativePath := strings.TrimPrefix(blobInfo.Name, searchPrefix)
+				adapter := blobPropertiesAdapter{blobInfo.Properties}
+				storedObject := newStoredObject(
+					preprocessor,
+					getObjectNameOnly(blobInfo.Name),
+					relativePath,
+					common.EEntityType.File(),
+					blobInfo.Properties.LastModified,
+					*blobInfo.Properties.ContentLength,
+					adapter,
+					adapter, // adapter satisfies both interfaces
+					common.FromAzBlobMetadataToCommonMetadata(blobInfo.Metadata),
+					blobUrlParts.ContainerName,
+				)
+				enqueueOutput(storedObject, nil)
 			}

-			relativePath := strings.TrimPrefix(blobInfo.Name, searchPrefix)
+			marker = lResp.NextMarker
+		}
+		return nil
+	}

-			// if recursive
-			if !t.recursive && strings.Contains(relativePath, common.AZCOPY_PATH_SEPARATOR_STRING) {
-				continue
-			}
+	// initiate parallel scanning, starting at the root path
+	workerContext, cancelWorkers := context.WithCancel(t.ctx)
+	cCrawled := parallel.Crawl(workerContext, searchPrefix+extraSearchPrefix, enumerateOneDir, enumerationParallelism)

-			adapter := blobPropertiesAdapter{blobInfo.Properties}
-			storedObject := newStoredObject(
-				preprocessor,
-				getObjectNameOnly(blobInfo.Name),
-				relativePath,
-				common.EEntityType.File(),
-				blobInfo.Properties.LastModified,
-				*blobInfo.Properties.ContentLength,
-				adapter,
-				adapter, // adapter satisfies both interfaces
-				common.FromAzBlobMetadataToCommonMetadata(blobInfo.Metadata),
-				blobUrlParts.ContainerName,
-			)
-
-			if t.incrementEnumerationCounter != nil {
-				t.incrementEnumerationCounter(common.EEntityType.File())
-			}
+	for x := range cCrawled {
+		item, workerError := x.Item()
+		if workerError != nil {
+			cancelWorkers()
+			return workerError
+		}

-			processErr := processIfPassedFilters(filters, storedObject, processor)
-			if processErr != nil {
-				return processErr
-			}
+		if t.incrementEnumerationCounter != nil {
+			t.incrementEnumerationCounter(common.EEntityType.File())
 		}

-		marker = listBlob.NextMarker
+		object := item.(storedObject)
+		processErr := processIfPassedFilters(filters, object, processor)
+		if processErr != nil {
+			cancelWorkers()
+			return processErr
+		}
 	}

 	return
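This is the heart of the commit: the old single-threaded ListBlobsFlatSegment loop becomes a per-directory callback (enumerateOneDir) handed to parallel.Crawl, which lists virtual directories concurrently via ListBlobsHierarchySegment and feeds results back on a channel. The common/parallel package itself is not shown in this diff, so the sketch below is a generic, assumed version of the pattern (a bounded worker pool pulling directories from a shared queue), not the repo's actual implementation:

```go
package main

import (
	"fmt"
	"strings"
	"sync"
)

type crawlResult struct {
	item string
	err  error
}

// crawl runs `parallelism` workers; each pulls a directory off the queue,
// enumerates it, emits entries to the output channel, and enqueues any
// subdirectories it discovers for other workers to pick up.
func crawl(root string, enumerate func(dir string, enqueueDir func(string), emit func(string, error)) error, parallelism int) <-chan crawlResult {
	out := make(chan crawlResult)
	dirs := make(chan string, 1024) // pending directories (buffered to keep the sketch simple)
	var pending sync.WaitGroup      // counts directories enqueued but not yet fully enumerated

	pending.Add(1)
	dirs <- root

	var workers sync.WaitGroup
	for i := 0; i < parallelism; i++ {
		workers.Add(1)
		go func() {
			defer workers.Done()
			for dir := range dirs {
				err := enumerate(dir,
					func(sub string) { pending.Add(1); dirs <- sub },
					func(item string, err error) { out <- crawlResult{item, err} })
				if err != nil {
					out <- crawlResult{"", err}
				}
				pending.Done()
			}
		}()
	}
	// close the queue once every directory is done, then close the output
	go func() { pending.Wait(); close(dirs); workers.Wait(); close(out) }()
	return out
}

func main() {
	// fake "listing": keys ending in "/" are virtual directories, others are blobs
	tree := map[string][]string{
		"":     {"a/", "b/", "root.txt"},
		"a/":   {"a/1.txt", "a/2.txt"},
		"b/":   {"b/c/", "b/3.txt"},
		"b/c/": {"b/c/4.txt"},
	}
	results := crawl("", func(dir string, enqueueDir func(string), emit func(string, error)) error {
		for _, entry := range tree[dir] {
			if strings.HasSuffix(entry, "/") {
				enqueueDir(entry) // like enqueuing lResp.Segment.BlobPrefixes above
			} else {
				emit(entry, nil) // like enqueueOutput(storedObject, nil)
			}
		}
		return nil
	}, 4)
	for r := range results {
		fmt.Println(r.item, r.err)
	}
}
```

The WaitGroup counts outstanding directories rather than workers, so the queue can be closed exactly when every enqueued directory has been fully listed. A production crawler would need an unbounded pending list (rather than a fixed buffer) to rule out deadlock on very wide namespaces.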

cmd/zt_remove_blob_test.go (+1 -1)
@@ -412,7 +412,7 @@ func (s *cmdIntegrationSuite) TestRemoveBlobsWithDirectoryStubs(c *chk.C) {
 	runCopyAndVerify(c, raw, func(err error) {
 		c.Assert(err, chk.IsNil)

-		// there should be exactly 20 top files, no directory stubs should included
+		// there should be exactly 20 top files, no directory stubs should be included
 		c.Assert(len(mockedRPC.transfers), chk.Equals, 20)

 		for _, transfer := range mockedRPC.transfers {

tool_test_perf.sh (+40 -0)
@@ -0,0 +1,40 @@
+#!/bin/bash
+
+# this script provides a quick way to validate whether a change causes a performance gain versus regression
+
+RunCurrent () {
+	./azcopy_current cp 'src' 'dst' --recursive --log-level=WARNING --check-length=false
+}
+
+RunNew () {
+	./azcopy_new cp 'src' 'dst' --recursive --log-level=WARNING --check-length=false
+}
+
+export AZCOPY_LOG_LOCATION=/datadrive/logs
+export AZCOPY_JOB_PLAN_LOCATION=/datadrive/plans
+
+for i in {1..5}
+do
+   # start with the new version, which might give it a disadvantage
+   # but since we run the experiment repeatedly, the results average out eventually
+   # in addition, it's better to give the disadvantage to the new version so that we can be extra sure that it's better/equal to current
+   echo Running new version for "$i"th time >> cmd-output.txt
+
+   # keep the result of the run, in case any error occurs
+   start_time=$(date +%s)
+   RunNew >> cmd-output.txt
+   end_time=$(date +%s)
+
+   # insert a record into the result CSV file
+   # this format is used so that we can import it easily into Excel
+   echo $i, new, $(expr "$end_time" - "$start_time") >> result.csv
+
+   # run the current version immediately after
+   echo Running current version for "$i"th time >> cmd-output.txt
+
+   # do the same for current version
+   start_time=$(date +%s)
+   RunCurrent >> cmd-output.txt
+   end_time=$(date +%s)
+   echo $i, current, $(expr "$end_time" - "$start_time") >> result.csv
+done
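The result.csv rows come out as "i, label, seconds" for import into Excel. A hypothetical Go companion could average them directly instead (not part of this commit; the file name summarize.go and the hard-coded path are illustrative):

```go
// summarize.go: average the per-run durations in result.csv by version label.
package main

import (
	"bufio"
	"fmt"
	"os"
	"strconv"
	"strings"
)

func main() {
	f, err := os.Open("result.csv")
	if err != nil {
		panic(err)
	}
	defer f.Close()

	sums := map[string]float64{} // total seconds per version label ("new" / "current")
	counts := map[string]int{}   // number of runs per version label
	scanner := bufio.NewScanner(f)
	for scanner.Scan() {
		fields := strings.Split(scanner.Text(), ",") // e.g. "3, new, 542"
		if len(fields) != 3 {
			continue
		}
		label := strings.TrimSpace(fields[1])
		secs, err := strconv.ParseFloat(strings.TrimSpace(fields[2]), 64)
		if err != nil {
			continue
		}
		sums[label] += secs
		counts[label]++
	}
	for label, total := range sums {
		fmt.Printf("%s: %.1f s average over %d runs\n", label, total/float64(counts[label]), counts[label])
	}
}
```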
