Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

From Linux bash script to Python control flow #386

Merged
merged 25 commits into from
Jul 11, 2020
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
e417791
Remove the linux bash script, replaced with a `docker exec` approach
joerick Jun 23, 2020
53c00a0
Use a long-running remote bash shell, rather than `docker exec`
joerick Jun 24, 2020
31339ba
Extract DockerContainer into its own file
joerick Jun 24, 2020
a052c71
Remove hard-coded shell info to use the docker ENTRYPOINT instead
joerick Jun 24, 2020
850d109
Don't use `docker exec -w` because CircleCI doesn't support it
joerick Jun 24, 2020
dd4fc69
Revert "Remove hard-coded shell info to use the docker ENTRYPOINT ins…
joerick Jun 25, 2020
fefe128
Fix issues with spaces in environment variables
joerick Jun 25, 2020
68d8f89
Style tweaks
joerick Jun 26, 2020
1d93336
Explicit buffer size isn't necessary
joerick Jun 26, 2020
df69dda
Fix broken PATH test - it was erroring for a different reason
joerick Jun 26, 2020
2edb336
Import organisation
joerick Jun 26, 2020
762dc80
Convert DockerContainer to use binary PIPEs, to handle arbitrary data
joerick Jun 26, 2020
c956f11
Add unit tests for DockerContainer
joerick Jun 26, 2020
ad871f8
Fix style
joerick Jun 26, 2020
91c3e90
Fix Python 3.6 compatibility
joerick Jun 26, 2020
faf6a76
Change 'slow' tests to 'docker' tests, only run them on Linux
joerick Jun 28, 2020
27d97da
Don't use redirection syntax in Windows environment. Fix absolute pat…
joerick Jun 28, 2020
e31c24d
Fix overridden PATH test, by ensuring bash command substitution rules
joerick Jun 28, 2020
b12a560
Add docker test images for other architectures
joerick Jul 1, 2020
cc913ee
Use a different method for the Windows test
joerick Jul 5, 2020
833a069
Pass PATH during command substitution test
joerick Jul 5, 2020
9e35d65
Add file-based unit tests
joerick Jul 8, 2020
4c0af9f
Finish file/dir tests
joerick Jul 10, 2020
44e3198
Convert glob to take a Path and a str pattern
joerick Jul 10, 2020
5842a75
Flake8 fixes
joerick Jul 10, 2020
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 13 additions & 4 deletions cibuildwheel/bashlex_eval.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,26 @@
import shlex
import subprocess

from typing import Dict, NamedTuple
from typing import Dict, NamedTuple, Callable, Optional

import bashlex # type: ignore


# a function that takes a shell command and the environment, and returns the result
EnvironmentExecutor = Callable[[str, Dict[str, str]], str]


def local_environment_executor(command: str, env: Dict[str, str]) -> str:
return subprocess.check_output(shlex.split(command), env=env, universal_newlines=True)


class NodeExecutionContext(NamedTuple):
environment: Dict[str, str]
input: str
executor: EnvironmentExecutor


def evaluate(value: str, environment: Dict[str, str]) -> str:
def evaluate(value: str, environment: Dict[str, str], executor: Optional[EnvironmentExecutor] = None) -> str:
if not value:
# empty string evaluates to empty string
# (but trips up bashlex)
Expand All @@ -26,7 +35,7 @@ def evaluate(value: str, environment: Dict[str, str]) -> str:

return evaluate_node(
value_word_node,
context=NodeExecutionContext(environment=environment, input=value)
context=NodeExecutionContext(environment=environment, input=value, executor=executor or local_environment_executor)
)


Expand Down Expand Up @@ -67,7 +76,7 @@ def evaluate_word_node(node: bashlex.ast.node, context: NodeExecutionContext) ->
def evaluate_command_node(node: bashlex.ast.node, context: NodeExecutionContext) -> str:
words = [evaluate_node(part, context=context) for part in node.parts]
command = ' '.join(words)
return subprocess.check_output(shlex.split(command), env=context.environment, universal_newlines=True)
return context.executor(command, context.environment)


def evaluate_parameter_node(node: bashlex.ast.node, context: NodeExecutionContext) -> str:
Expand Down
178 changes: 178 additions & 0 deletions cibuildwheel/docker_container.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
import io
import json
import shlex
import subprocess
import sys
import uuid
from os import PathLike
from pathlib import Path, PurePath
from typing import IO, Dict, List, Optional, Sequence, TextIO, Union


class DockerContainer:
'''
An object that represents a running Docker container.

Intended for use as a context manager e.g.
`with DockerContainer('ubuntu') as docker:`

A bash shell is running in the remote container. When `call()` is invoked,
the command is relayed to the remote shell, and the results are streamed
back to cibuildwheel.
'''
UTILITY_PYTHON = '/opt/python/cp38-cp38/bin/python'

process: subprocess.Popen
bash_stdin: IO[str]
bash_stdout: IO[str]

def __init__(self, docker_image: str, simulate_32_bit=False):
self.docker_image = docker_image
self.simulate_32_bit = simulate_32_bit

def __enter__(self) -> 'DockerContainer':
self.container_name = f'cibuildwheel-{uuid.uuid4()}'
shell_args = ['linux32', '/bin/bash'] if self.simulate_32_bit else ['/bin/bash']
subprocess.run(
[
'docker', 'create',
'--env', 'CIBUILDWHEEL',
'--name', self.container_name,
'-i',
'-v', '/:/host', # ignored on CircleCI
self.docker_image,
*shell_args
],
check=True,
)
process = subprocess.Popen(
[
'docker', 'start',
'--attach', '--interactive',
self.container_name,
],
encoding='utf8',
stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
# make the input buffer large enough to carry a lot of environment
# variables. We choose 256kB.
bufsize=262144,
)
self.process = process
assert process.stdin and process.stdout
self.bash_stdin = process.stdin
self.bash_stdout = process.stdout
return self

def __exit__(self, exc_type, exc_val, exc_tb):
self.bash_stdin.close()
self.process.terminate()
self.process.wait()

subprocess.run(['docker', 'rm', '--force', '-v', self.container_name])
self.container_name = None

def copy_into(self, from_path: Path, to_path: PurePath) -> None:
# `docker cp` causes 'no space left on device' error when
# a container is running and the host filesystem is
# mounted. https://github.com/moby/moby/issues/38995
# Use `docker exec` instead.
if from_path.is_dir():
self.call(['mkdir', '-p', to_path])
subprocess.run(
f'tar cf - . | docker exec -i {self.container_name} tar -xC {to_path} -f -',
shell=True,
check=True,
cwd=from_path)
else:
subprocess.run(
f'cat {from_path} | docker exec -i {self.container_name} sh -c "cat > {to_path}"',
shell=True,
check=True)

def copy_out(self, from_path: PurePath, to_path: Path) -> None:
# note: we assume from_path is a dir
to_path.mkdir(parents=True, exist_ok=True)

subprocess.run(
f'docker exec -i {self.container_name} tar -cC {from_path} -f - . | tar -xf -',
shell=True,
check=True,
cwd=to_path
)

def glob(self, pattern: PurePath) -> List[PurePath]:
path_strs = json.loads(self.call([
self.UTILITY_PYTHON,
'-c',
f'import sys, json, glob; json.dump(glob.glob({str(pattern)!r}), sys.stdout)'
], capture_output=True))

return [PurePath(p) for p in path_strs]

def call(self, args: Sequence[Union[str, PathLike]], env: Dict[str, str] = {},
capture_output=False, cwd: Optional[Union[str, PathLike]] = None) -> str:
chdir = f'cd {cwd}' if cwd else ''
env_assignments = ' '.join(f'{shlex.quote(k)}={shlex.quote(v)}'
for k, v in env.items())
command = ' '.join(shlex.quote(str(a)) for a in args)
end_of_message = str(uuid.uuid4())
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we somehow use \0 to mark the end of a message? To be fair, I don't know how bash handles this.

At any rate, do we need to generate a UUID each time? I'd think we could just take a fixed string marker?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's nothing stopping a child process from outputting \0 and causing havoc.

Normally, when doing this kind of protocol, there'd be a header that specifies the length of the following message -that makes reads really easy. But I wanted to avoid buffering the output on the container side, for memory reasons but mostly so that output still streamed back to the console - during a long build this is very reassuring, or helpful if a command stalls. This solution was inspired by SMTP multipart boundaries - basically a random string long enough is unique enough to delimit between parts of data. UUID is just an easy way to make a unique string.

I could add some of this context as a comment I guess?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But I wanted to avoid buffering the output on the container side, for memory reasons but mostly so that output still streamed back to the console - during a long build this is very reassuring, or helpful if a command stalls.

Alright, yes. Good point! I hadn't considered that, yet, but this is an important feature, then! :-)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We could still just define our own constant magic end-of-message UUID, no? Is there a reason to introduce randomness and regenerate a new one every time?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think there's something comforting about using a new UUID each time - it's a string that's never existed in the world before. but practically, err, the most concrete I can get with it is that if it was static and written in code, and then somehow this code file was sent down the pipe, then the terminator would appear before the end of the message. Far-fetched, but that's the kind of thing I'm thinking about. But a fresh UUID has never existed before, so that kind of thing couldn't happen.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's a bit far-fetched, but it had completely not crossed my mind! Better safe than sorry, then :-)


# log the command we're executing
print(f' + {command}')

# Write a command to the remote shell. First we change the
# cwd, if that's required. Then, we use the `env` utility to run
# `command` inside the specified environment. We use `env` because it
# can cope with spaces and strange characters in the name or value.
# Finally, the remote shell is told to write a footer - this will show
# up in the output so we know when to stop reading, and will include
# the returncode of `command`.
self.bash_stdin.write(f'''(
{chdir}
env {env_assignments} {command}
printf "%04d%s\n" $? {end_of_message}
)
''')
self.bash_stdin.flush()

if capture_output:
output_io: TextIO = io.StringIO()
else:
output_io = sys.stdout

while True:
line = self.bash_stdout.readline()

if line.endswith(end_of_message+'\n'):
footer_offset = (
len(line)
- 1 # newline character
- len(end_of_message) # delimiter
- 4 # 4 returncode decimals
)
returncode_str = line[footer_offset:footer_offset+4]
returncode = int(returncode_str)
# add the last line to output, without the footer
output_io.write(line[0:footer_offset])
break
else:
output_io.write(line)

output = output_io.getvalue() if isinstance(output_io, io.StringIO) else None

if returncode != 0:
raise subprocess.CalledProcessError(returncode, args, output)

return output if output else ''

def get_environment(self) -> Dict[str, str]:
return json.loads(self.call([
self.UTILITY_PYTHON,
'-c',
'import sys, json, os; json.dump(os.environ.copy(), sys.stdout)'
], capture_output=True))

def environment_executor(self, command: str, environment: Dict[str, str]) -> str:
# used as an EnvironmentExecutor to evaluate commands and capture output
return self.call(shlex.split(command), env=environment)
12 changes: 7 additions & 5 deletions cibuildwheel/environment.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import bashlex # type: ignore

from typing import Dict, List, Mapping
from typing import Dict, List, Mapping, Optional

from . import bashlex_eval

Expand Down Expand Up @@ -46,9 +46,9 @@ def __init__(self, assignment: str):
self.name = name
self.value = value

def evaluated_value(self, environment: Dict[str, str]) -> str:
def evaluated_value(self, environment: Dict[str, str], executor: Optional[bashlex_eval.EnvironmentExecutor] = None) -> str:
'''Returns the value of this assignment, as evaluated in the environment'''
return bashlex_eval.evaluate(self.value, environment=environment)
return bashlex_eval.evaluate(self.value, environment=environment, executor=executor)

def as_shell_assignment(self) -> str:
return f'export {self.name}={self.value}'
Expand All @@ -61,11 +61,13 @@ class ParsedEnvironment:
def __init__(self, assignments: List[EnvironmentAssignment]):
self.assignments = assignments

def as_dictionary(self, prev_environment: Mapping[str, str]) -> Dict[str, str]:
def as_dictionary(self,
prev_environment: Mapping[str, str],
executor: Optional[bashlex_eval.EnvironmentExecutor] = None) -> Dict[str, str]:
environment = dict(**prev_environment)

for assignment in self.assignments:
value = assignment.evaluated_value(environment=environment)
value = assignment.evaluated_value(environment=environment, executor=executor)
environment[assignment.name] = value

return environment
Expand Down
Loading